xref: /aosp_15_r20/external/executorch/examples/qualcomm/oss_scripts/llama2/runner/runner.h (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1 /*
2  * Copyright (c) Qualcomm Innovation Center, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // A simple llama2 runner that includes preprocessing and post processing logic.
10 // The module takes in a string as input and emits a string as output.
11 
12 #pragma once
13 
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/extension/llm/sampler/sampler.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
25 
26 class RpcMemAllocator {
27  public:
RpcMemAllocator(QnnMemDescriptor shared_buffer_type)28   RpcMemAllocator(QnnMemDescriptor shared_buffer_type)
29       : shared_buffer_type_(shared_buffer_type){};
allocate(size_t bytes,size_t alignment)30   bool allocate(size_t bytes, size_t alignment) {
31     ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment);
32     if (ptr_ == nullptr) {
33       ET_LOG(
34           Info,
35           "Allocate Rpc mem falied, fallback to nromal ptr: bytes=%zu, alignment=%zu",
36           bytes,
37           alignment);
38       input_data_.resize(bytes);
39       ptr_ = input_data_.data();
40     }
41     return ptr_ != nullptr;
42   }
43 
~RpcMemAllocator()44   ~RpcMemAllocator() {
45     if (shared_buffer_type_ == QnnMemDescriptor::kIon ||
46         shared_buffer_type_ == QnnMemDescriptor::kCustom) {
47       if (ptr_ != nullptr) {
48         QnnExecuTorchFreeCustomMem(ptr_);
49       }
50     }
51   }
52 
GetPtr()53   void* GetPtr() {
54     return ptr_;
55   }
56 
57  private:
58   QnnMemDescriptor shared_buffer_type_;
59   void* ptr_{nullptr};
60   std::vector<char> input_data_;
61   std::vector<size_t> tensor_base_addrs_;
62 };
63 
// Generates accessors for one scalar IO buffer (e.g. atten_mask, logit)
// inside IoMemMgr. Relies on members `ptr_`, `<name>_pos_` and `io_info_`
// of the enclosing class:
//   get_<name>_pos(): byte offset of the buffer within the shared region.
//   get_<name>_ptr(): pointer to the buffer (base + offset).
//   set_<name>_ptr(): registers the buffer with the QNN backend as a
//     custom mem tensor and returns its pointer.
#define DEFINE_IOMEMMGR_ACCESSOR(name)                  \
  size_t get_##name##_pos() const {                     \
    return name##_pos_;                                 \
  }                                                     \
  char* get_##name##_ptr() const {                      \
    return reinterpret_cast<char*>(ptr_) + name##_pos_; \
  }                                                     \
  char* set_##name##_ptr() {                            \
    CustomMemTensorInfo info = {                        \
        ptr_,                                           \
        ptr_ + name##_pos_,                             \
        name##_pos_,                                    \
        io_info_.name.size,                             \
        io_info_.name.shape.data(),                     \
        io_info_.name.rank,                             \
        io_info_.name.dtype};                           \
    QnnExecuTorchAddCustomMemTensorInfo(info);          \
    return reinterpret_cast<char*>(ptr_) + name##_pos_; \
  }
83 
// Vector variant of DEFINE_IOMEMMGR_ACCESSOR for the per-layer KV cache
// buffers; `<name>_pos_` is a std::vector<size_t> of byte offsets:
//   get_<name>_pos_vec(): all offsets.
//   get_<name>_ptr(idx): pointer of entry `idx`.
//   set_<name>(idx, pos): stores the offset and registers the buffer with
//     the QNN backend as a custom mem tensor; returns its pointer.
//   update_<name>(idx, shift_size): advances the offset by `shift_size`
//     (how KV cache pointers are shifted every decoding step) and returns
//     the new pointer.
#define DEFINE_IOMEMMGR_VEC_ACCESSOR(name)                   \
  const std::vector<size_t>& get_##name##_pos_vec() const {  \
    return name##_pos_;                                      \
  }                                                          \
  char* get_##name##_ptr(int idx) {                          \
    return ptr_ + name##_pos_[idx];                          \
  }                                                          \
  char* set_##name(int idx, size_t pos) {                    \
    name##_pos_[idx] = pos;                                  \
    CustomMemTensorInfo info = {                             \
        ptr_,                                                \
        ptr_ + name##_pos_[idx],                             \
        name##_pos_[idx],                                    \
        io_info_.name.size,                                  \
        io_info_.name.shape.data(),                          \
        io_info_.name.rank,                                  \
        io_info_.name.dtype};                                \
    QnnExecuTorchAddCustomMemTensorInfo(info);               \
    return reinterpret_cast<char*>(ptr_) + pos;              \
  }                                                          \
  char* update_##name(int idx, size_t shift_size) {          \
    name##_pos_[idx] += shift_size;                          \
    return reinterpret_cast<char*>(ptr_) + name##_pos_[idx]; \
  }
108 
109 namespace example {
110 class IoMemMgr {
111  public:
112   // Allocate a big memory which is capable to contain all IO of all modules
IoMemMgr()113   IoMemMgr(){};
114   IoMemMgr(executorch::runtime::MethodMeta method_meta);
115 
116   struct InfoAttrs {
117     std::unique_ptr<executorch::runtime::TensorInfo> tensor_meta;
118     size_t size = 0;
119     std::vector<uint32_t> shape;
120     uint32_t rank;
121     size_t element_size;
122     executorch::aten::ScalarType dtype;
123   };
124 
125   struct IoInfo {
126     InfoAttrs input_token;
127     InfoAttrs pos_idx;
128     InfoAttrs atten_mask;
129     InfoAttrs k_caches_read;
130     InfoAttrs k_caches_write;
131     InfoAttrs v_caches_read;
132     InfoAttrs v_caches_write;
133     InfoAttrs logit;
134     std::vector<InfoAttrs*> tensor_info{
135         &input_token,
136         &pos_idx,
137         &atten_mask,
138         &k_caches_read,
139         &k_caches_write,
140         &v_caches_read,
141         &v_caches_write,
142         &logit,
143     };
144   };
145 
allocate(size_t alignment)146   bool allocate(size_t alignment) {
147     bool ret = rpc_mem_allocator.allocate(total_nbytes_, alignment);
148     ptr_ = reinterpret_cast<char*>(rpc_mem_allocator.GetPtr());
149     return ret;
150   }
151   bool init_tensors();
152 
get_custom_mem_ptr()153   char* get_custom_mem_ptr() {
154     return ptr_;
155   }
156 
157   // Pointers of k cache read, v cache read and write are shifted every step.
158   // Set them first to register mem handle during qnn delegation init.
159   void set_all_shifted_ptrs(size_t max_seq_len);
160 
161   DEFINE_IOMEMMGR_ACCESSOR(atten_mask);
162   DEFINE_IOMEMMGR_ACCESSOR(input_token);
163   DEFINE_IOMEMMGR_ACCESSOR(pos_idx);
164   DEFINE_IOMEMMGR_ACCESSOR(logit);
165 
166   DEFINE_IOMEMMGR_VEC_ACCESSOR(k_caches_read);
167   DEFINE_IOMEMMGR_VEC_ACCESSOR(k_caches_write);
168   DEFINE_IOMEMMGR_VEC_ACCESSOR(v_caches_read);
169   DEFINE_IOMEMMGR_VEC_ACCESSOR(v_caches_write);
170 
171  private:
172   size_t total_nbytes_{0};
173   char* ptr_{nullptr};
174   void compute_total_nbytes();
175   void set_tensor_meta();
176   void init_io_info();
177 
178   size_t atten_mask_pos_;
179   size_t input_token_pos_{0};
180   size_t logit_pos_;
181   size_t pos_idx_pos_;
182   std::vector<size_t> k_caches_read_pos_;
183   std::vector<size_t> k_caches_write_pos_;
184   std::vector<size_t> v_caches_read_pos_;
185   std::vector<size_t> v_caches_write_pos_;
186 
187   IoInfo io_info_;
188   std::unique_ptr<executorch::runtime::MethodMeta> method_meta_;
189   RpcMemAllocator rpc_mem_allocator{QnnMemDescriptor::kCustom};
190   std::unordered_map<executorch::aten::ScalarType, size_t> scalar_type_to_size =
191       {
192           {executorch::aten::ScalarType::Int, sizeof(int32_t)},
193           {executorch::aten::ScalarType::Float, sizeof(float)},
194           {executorch::aten::ScalarType::Char, sizeof(int8_t)},
195           {executorch::aten::ScalarType::Short, sizeof(int16_t)},
196           {executorch::aten::ScalarType::Byte, sizeof(uint8_t)},
197           {executorch::aten::ScalarType::Bits16, sizeof(uint16_t)},
198   };
199 };
200 
201 class Runner {
202  public:
203   explicit Runner(
204       const std::string& model_path,
205       const std::string& tokenizer_path,
206       const float temperature = 0.8f);
207 
208   struct Stats {
209     // Scaling factor for timestamps - in this case, we use ms.
210     const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
211     // Time stamps for the different stages of the execution
212     // model_load_start_ms: Start of model loading.
213     long model_load_start_ms;
214     // model_load_end_ms: End of model loading.
215     long model_load_end_ms;
216     // inference_start_ms: Immediately after the model is loaded (or we check
217     // for model load), measure the inference time.
218     long inference_start_ms;
219     // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
220     // before the inference loop starts
221     long prompt_eval_end_ms;
222     // first_token: Timestamp when the first generated token is emitted
223     long first_token_ms;
224     // inference_end_ms: End of inference/generation.
225     long inference_end_ms;
226     // Keep a running total of the time spent in sampling.
227     long aggregate_sampling_time_ms;
228     // Token count from prompt
229     int64_t num_prompt_tokens;
230     // Token count from generated (total - prompt)
231     int64_t num_generated_tokens;
232   };
233 
234   bool is_loaded() const;
235   executorch::runtime::Error load();
236   executorch::runtime::Error mem_alloc(size_t alignment, size_t seq_len);
237   executorch::runtime::Error generate(
238       const std::string& prompt,
239       int32_t seq_len,
240       std::function<void(const std::string&)> token_callback = {},
241       std::function<void(const Stats&)> stats_callback = {});
242   void stop();
243   executorch::runtime::Result<executorch::runtime::MethodMeta>
244   get_method_meta();
245 
246  private:
247   // metadata
248   template <typename T>
249   T getMetadataHelper(std::string method_name, T default_val);
250   template <typename T>
251   int32_t logitsToToken(const executorch::aten::Tensor& logits_tensor);
252   executorch::runtime::Result<executorch::aten::Tensor> run_model_step(
253       int64_t input_token,
254       ::executorch::extension::TensorPtr& token,
255       ::executorch::extension::TensorPtr& start_pos,
256       ::executorch::extension::TensorPtr& atten_mask,
257       std::vector<::executorch::extension::TensorPtr>& kv_tensors,
258       std::vector<::executorch::extension::TensorPtr>& kv_outputs);
259   // metadata
260   int32_t vocab_size_;
261   int64_t bos_id_;
262   int64_t eos_id_;
263   int32_t n_bos_;
264   int32_t n_eos_;
265   int32_t max_seq_len_;
266   int32_t head_dim_;
267   int32_t dim_;
268   std::unordered_set<std::string> model_methods_;
269   std::unique_ptr<executorch::extension::Module> module_;
270   std::string tokenizer_path_;
271   std::string model_path_;
272   float temperature_;
273   std::unique_ptr<executorch::extension::llm::Tokenizer> tokenizer_;
274   std::unique_ptr<executorch::extension::llm::Sampler> sampler_;
275   bool shouldStop_{false};
276   Stats stats_;
277   IoMemMgr io_mem_mgr_;
278 };
279 
280 } // namespace example
281