1 /* 2 * Copyright (c) Qualcomm Innovation Center, Inc. 3 * All rights reserved. 4 * 5 * This source code is licensed under the BSD-style license found in the 6 * LICENSE file in the root directory of this source tree. 7 */ 8 9 // A simple llama2 runner that includes preprocessing and post processing logic. 10 // The module takes in a string as input and emits a string as output. 11 12 #pragma once 13 14 #include <cstdint> 15 #include <functional> 16 #include <memory> 17 #include <string> 18 #include <unordered_map> 19 20 #include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h> 21 #include <executorch/extension/llm/sampler/sampler.h> 22 #include <executorch/extension/llm/tokenizer/tokenizer.h> 23 #include <executorch/extension/module/module.h> 24 #include <executorch/extension/tensor/tensor.h> 25 26 class RpcMemAllocator { 27 public: RpcMemAllocator(QnnMemDescriptor shared_buffer_type)28 RpcMemAllocator(QnnMemDescriptor shared_buffer_type) 29 : shared_buffer_type_(shared_buffer_type){}; allocate(size_t bytes,size_t alignment)30 bool allocate(size_t bytes, size_t alignment) { 31 ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment); 32 if (ptr_ == nullptr) { 33 ET_LOG( 34 Info, 35 "Allocate Rpc mem falied, fallback to nromal ptr: bytes=%zu, alignment=%zu", 36 bytes, 37 alignment); 38 input_data_.resize(bytes); 39 ptr_ = input_data_.data(); 40 } 41 return ptr_ != nullptr; 42 } 43 ~RpcMemAllocator()44 ~RpcMemAllocator() { 45 if (shared_buffer_type_ == QnnMemDescriptor::kIon || 46 shared_buffer_type_ == QnnMemDescriptor::kCustom) { 47 if (ptr_ != nullptr) { 48 QnnExecuTorchFreeCustomMem(ptr_); 49 } 50 } 51 } 52 GetPtr()53 void* GetPtr() { 54 return ptr_; 55 } 56 57 private: 58 QnnMemDescriptor shared_buffer_type_; 59 void* ptr_{nullptr}; 60 std::vector<char> input_data_; 61 std::vector<size_t> tensor_base_addrs_; 62 }; 63 64 #define DEFINE_IOMEMMGR_ACCESSOR(name) \ 65 size_t get_##name##_pos() const { \ 66 return name##_pos_; \ 67 } \ 68 char* get_##name##_ptr() const { \ 69 return reinterpret_cast<char*>(ptr_) + name##_pos_; \ 70 } \ 71 char* set_##name##_ptr() { \ 72 CustomMemTensorInfo info = { \ 73 ptr_, \ 74 ptr_ + name##_pos_, \ 75 name##_pos_, \ 76 io_info_.name.size, \ 77 io_info_.name.shape.data(), \ 78 io_info_.name.rank, \ 79 io_info_.name.dtype}; \ 80 QnnExecuTorchAddCustomMemTensorInfo(info); \ 81 return reinterpret_cast<char*>(ptr_) + name##_pos_; \ 82 } 83 84 #define DEFINE_IOMEMMGR_VEC_ACCESSOR(name) \ 85 const std::vector<size_t>& get_##name##_pos_vec() const { \ 86 return name##_pos_; \ 87 } \ 88 char* get_##name##_ptr(int idx) { \ 89 return ptr_ + name##_pos_[idx]; \ 90 } \ 91 char* set_##name(int idx, size_t pos) { \ 92 name##_pos_[idx] = pos; \ 93 CustomMemTensorInfo info = { \ 94 ptr_, \ 95 ptr_ + name##_pos_[idx], \ 96 name##_pos_[idx], \ 97 io_info_.name.size, \ 98 io_info_.name.shape.data(), \ 99 io_info_.name.rank, \ 100 io_info_.name.dtype}; \ 101 QnnExecuTorchAddCustomMemTensorInfo(info); \ 102 return reinterpret_cast<char*>(ptr_) + pos; \ 103 } \ 104 char* update_##name(int idx, size_t shift_size) { \ 105 name##_pos_[idx] += shift_size; \ 106 return reinterpret_cast<char*>(ptr_) + name##_pos_[idx]; \ 107 } 108 109 namespace example { 110 class IoMemMgr { 111 public: 112 // Allocate a big memory which is capable to contain all IO of all modules IoMemMgr()113 IoMemMgr(){}; 114 IoMemMgr(executorch::runtime::MethodMeta method_meta); 115 116 struct InfoAttrs { 117 std::unique_ptr<executorch::runtime::TensorInfo> tensor_meta; 118 size_t size = 0; 119 std::vector<uint32_t> shape; 120 uint32_t rank; 121 size_t element_size; 122 executorch::aten::ScalarType dtype; 123 }; 124 125 struct IoInfo { 126 InfoAttrs input_token; 127 InfoAttrs pos_idx; 128 InfoAttrs atten_mask; 129 InfoAttrs k_caches_read; 130 InfoAttrs k_caches_write; 131 InfoAttrs v_caches_read; 132 InfoAttrs v_caches_write; 133 InfoAttrs logit; 134 std::vector<InfoAttrs*> tensor_info{ 135 &input_token, 136 &pos_idx, 137 &atten_mask, 138 &k_caches_read, 139 &k_caches_write, 140 &v_caches_read, 141 &v_caches_write, 142 &logit, 143 }; 144 }; 145 allocate(size_t alignment)146 bool allocate(size_t alignment) { 147 bool ret = rpc_mem_allocator.allocate(total_nbytes_, alignment); 148 ptr_ = reinterpret_cast<char*>(rpc_mem_allocator.GetPtr()); 149 return ret; 150 } 151 bool init_tensors(); 152 get_custom_mem_ptr()153 char* get_custom_mem_ptr() { 154 return ptr_; 155 } 156 157 // Pointers of k cache read, v cache read and write are shifted every step. 158 // Set them first to register mem handle during qnn delegation init. 159 void set_all_shifted_ptrs(size_t max_seq_len); 160 161 DEFINE_IOMEMMGR_ACCESSOR(atten_mask); 162 DEFINE_IOMEMMGR_ACCESSOR(input_token); 163 DEFINE_IOMEMMGR_ACCESSOR(pos_idx); 164 DEFINE_IOMEMMGR_ACCESSOR(logit); 165 166 DEFINE_IOMEMMGR_VEC_ACCESSOR(k_caches_read); 167 DEFINE_IOMEMMGR_VEC_ACCESSOR(k_caches_write); 168 DEFINE_IOMEMMGR_VEC_ACCESSOR(v_caches_read); 169 DEFINE_IOMEMMGR_VEC_ACCESSOR(v_caches_write); 170 171 private: 172 size_t total_nbytes_{0}; 173 char* ptr_{nullptr}; 174 void compute_total_nbytes(); 175 void set_tensor_meta(); 176 void init_io_info(); 177 178 size_t atten_mask_pos_; 179 size_t input_token_pos_{0}; 180 size_t logit_pos_; 181 size_t pos_idx_pos_; 182 std::vector<size_t> k_caches_read_pos_; 183 std::vector<size_t> k_caches_write_pos_; 184 std::vector<size_t> v_caches_read_pos_; 185 std::vector<size_t> v_caches_write_pos_; 186 187 IoInfo io_info_; 188 std::unique_ptr<executorch::runtime::MethodMeta> method_meta_; 189 RpcMemAllocator rpc_mem_allocator{QnnMemDescriptor::kCustom}; 190 std::unordered_map<executorch::aten::ScalarType, size_t> scalar_type_to_size = 191 { 192 {executorch::aten::ScalarType::Int, sizeof(int32_t)}, 193 {executorch::aten::ScalarType::Float, sizeof(float)}, 194 {executorch::aten::ScalarType::Char, sizeof(int8_t)}, 195 {executorch::aten::ScalarType::Short, sizeof(int16_t)}, 196 {executorch::aten::ScalarType::Byte, sizeof(uint8_t)}, 197 {executorch::aten::ScalarType::Bits16, sizeof(uint16_t)}, 198 }; 199 }; 200 201 class Runner { 202 public: 203 explicit Runner( 204 const std::string& model_path, 205 const std::string& tokenizer_path, 206 const float temperature = 0.8f); 207 208 struct Stats { 209 // Scaling factor for timestamps - in this case, we use ms. 210 const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; 211 // Time stamps for the different stages of the execution 212 // model_load_start_ms: Start of model loading. 213 long model_load_start_ms; 214 // model_load_end_ms: End of model loading. 215 long model_load_end_ms; 216 // inference_start_ms: Immediately after the model is loaded (or we check 217 // for model load), measure the inference time. 218 long inference_start_ms; 219 // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right 220 // before the inference loop starts 221 long prompt_eval_end_ms; 222 // first_token: Timestamp when the first generated token is emitted 223 long first_token_ms; 224 // inference_end_ms: End of inference/generation. 225 long inference_end_ms; 226 // Keep a running total of the time spent in sampling. 227 long aggregate_sampling_time_ms; 228 // Token count from prompt 229 int64_t num_prompt_tokens; 230 // Token count from generated (total - prompt) 231 int64_t num_generated_tokens; 232 }; 233 234 bool is_loaded() const; 235 executorch::runtime::Error load(); 236 executorch::runtime::Error mem_alloc(size_t alignment, size_t seq_len); 237 executorch::runtime::Error generate( 238 const std::string& prompt, 239 int32_t seq_len, 240 std::function<void(const std::string&)> token_callback = {}, 241 std::function<void(const Stats&)> stats_callback = {}); 242 void stop(); 243 executorch::runtime::Result<executorch::runtime::MethodMeta> 244 get_method_meta(); 245 246 private: 247 // metadata 248 template <typename T> 249 T getMetadataHelper(std::string method_name, T default_val); 250 template <typename T> 251 int32_t logitsToToken(const executorch::aten::Tensor& logits_tensor); 252 executorch::runtime::Result<executorch::aten::Tensor> run_model_step( 253 int64_t input_token, 254 ::executorch::extension::TensorPtr& token, 255 ::executorch::extension::TensorPtr& start_pos, 256 ::executorch::extension::TensorPtr& atten_mask, 257 std::vector<::executorch::extension::TensorPtr>& kv_tensors, 258 std::vector<::executorch::extension::TensorPtr>& kv_outputs); 259 // metadata 260 int32_t vocab_size_; 261 int64_t bos_id_; 262 int64_t eos_id_; 263 int32_t n_bos_; 264 int32_t n_eos_; 265 int32_t max_seq_len_; 266 int32_t head_dim_; 267 int32_t dim_; 268 std::unordered_set<std::string> model_methods_; 269 std::unique_ptr<executorch::extension::Module> module_; 270 std::string tokenizer_path_; 271 std::string model_path_; 272 float temperature_; 273 std::unique_ptr<executorch::extension::llm::Tokenizer> tokenizer_; 274 std::unique_ptr<executorch::extension::llm::Sampler> sampler_; 275 bool shouldStop_{false}; 276 Stats stats_; 277 IoMemMgr io_mem_mgr_; 278 }; 279 280 } // namespace example 281