/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/examples/models/phi-3-mini/runner.h>

#include <cinttypes> // for PRIu64, used in the error message below
#include <ctime>
#include <iostream>

#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/platform/log.h>

using executorch::aten::ScalarType;
using executorch::extension::Module;
using executorch::extension::llm::BPETokenizer;
using executorch::extension::llm::Sampler;
using executorch::runtime::Error;

namespace example {

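// Top-p sampling cutoff, end-of-text token id, and vocabulary size for
// Phi-3-mini.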
#define SAMPLER_TOP 0.9f
#define ENDOFTEXT_TOKEN 32000
#define VOCABULARY_SIZE 32064

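// Loads the exported program as a Module, constructs the BPE tokenizer, and
// seeds a top-p sampler with the current wall-clock time.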
Runner::Runner(
    const std::string& model_path,
    const std::string& tokenizer_path,
    const float temperature)
    : module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
      tokenizer_(std::make_unique<BPETokenizer>()),
      sampler_(std::make_unique<Sampler>(
          VOCABULARY_SIZE,
          temperature,
          SAMPLER_TOP,
          static_cast<unsigned long long>(std::time(nullptr)))) {
  ET_CHECK_MSG(
      tokenizer_->load(tokenizer_path) == Error::Ok,
      "Failed to load tokenizer at %s",
      tokenizer_path.c_str());
  ET_LOG(
      Info,
      "Created Phi-3-mini runner: model_path=%s, tokenizer_path=%s",
      model_path.c_str(),
      tokenizer_path.c_str());
}

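// Generates text from the prompt: prefills the encoded prompt in one pass,
// then decodes token by token until ENDOFTEXT_TOKEN or max_seq_len is hit,
// streaming each decoded piece to stdout.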
void Runner::generate(const std::string& prompt, std::size_t max_seq_len) {
  auto encode_res = tokenizer_->encode(prompt, 0, 0);
  ET_CHECK_MSG(
      encode_res.error() == Error::Ok, "Failed to encode %s", prompt.c_str());
  auto input_tokens = encode_res.get();
  auto prev_token = input_tokens.back();
  // Prefill the whole prompt and emit the first generated token.
  auto current_token = prefill(input_tokens);
  std::cout << tokenizer_->decode(prev_token, current_token).get();
  std::cout.flush();

  std::size_t seq_len = input_tokens.size() + 1;

  // Decode one token per step until end-of-text or the length budget is hit.
  while (current_token != ENDOFTEXT_TOKEN && seq_len < max_seq_len) {
    prev_token = current_token;
    current_token = run_model_step(current_token);
    std::cout << tokenizer_->decode(prev_token, current_token).get();
    std::cout.flush();

    ++seq_len;
  }

  std::cout << std::endl;
}

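// Samples the next token id from the model's output logits.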
uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) {
  return sampler_->sample(logits_tensor.data_ptr<float>());
}

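// Runs the whole prompt through the model in a single forward pass and
// returns the first generated token.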
uint64_t Runner::prefill(std::vector<uint64_t>& tokens) {
  auto result = module_->forward(executorch::extension::from_blob(
      tokens.data(),
      {1, static_cast<exec_aten::SizesType>(tokens.size())},
      ScalarType::Long));
  ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens");

  return logits_to_token(result.get()[0].toTensor());
}

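// Runs one decode step: feeds the latest token (shape {1, 1}) and returns
// the next sampled token.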
uint64_t Runner::run_model_step(uint64_t token) {
  auto result = module_->forward(
      executorch::extension::from_blob(&token, {1, 1}, ScalarType::Long));
  ET_CHECK_MSG(
      result.error() == Error::Ok,
      "Failed to run forward() for token %" PRIu64,
      token);

  return logits_to_token(result.get()[0].toTensor());
}

} // namespace example
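
// A minimal usage sketch for this runner (hypothetical file paths and
// parameter values; the Runner API itself is defined above):
//
//   example::Runner runner(
//       "phi-3-mini.pte", "tokenizer.bin", /*temperature=*/0.8f);
//   runner.generate("Tell me a story", /*max_seq_len=*/128);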