/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "basic_sampler.h"
#include "basic_tokenizer.h"

#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/result.h>

using executorch::aten::ScalarType;
using executorch::aten::Tensor;
using executorch::extension::from_blob;
using executorch::extension::Module;
using executorch::runtime::EValue;
using executorch::runtime::Result;

// The value of the gpt2 `<|endoftext|>` token.
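// GPT-2's vocabulary contains 50257 tokens; `<|endoftext|>` is the final one
// (id 50256).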
#define ENDOFTEXT_TOKEN 50256

std::string generate(
    Module& llm_model,
    std::string& prompt,
    BasicTokenizer& tokenizer,
    BasicSampler& sampler,
    size_t max_input_length,
    size_t max_output_length) {
  // Convert the input text into a list of integers (tokens) that represents
  // it, using the string-to-token mapping that the model was trained on. Each
  // token is an integer that represents a word or part of a word.
  std::vector<int64_t> input_tokens = tokenizer.encode(prompt);
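  // Accumulates every generated token so the complete output can be decoded
  // into a single string at the end.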
  std::vector<int64_t> output_tokens;

  for (auto i = 0u; i < max_output_length; i++) {
    // Convert the input_tokens from a vector of int64_t to EValue. EValue is a
    // unified data type in the ExecuTorch runtime.
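    // Note that from_blob wraps the existing token buffer rather than copying
    // it, so input_tokens must stay alive and unmodified until forward()
    // returns.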
    auto inputs = from_blob(
        input_tokens.data(),
        {1, static_cast<int>(input_tokens.size())},
        ScalarType::Long);

    // Run the model. On success it returns a tensor of logits (unnormalized
    // next-token scores).
    auto logits_evalue = llm_model.forward(inputs);
    if (!logits_evalue.ok()) {
      std::cerr << "Model inference failed." << std::endl;
      break;
    }

    // Convert the output logits from EValue to std::vector, which is what the
    // sampler expects.
    Tensor logits_tensor = logits_evalue.get()[0].toTensor();
    std::vector<float> logits(
        logits_tensor.data_ptr<float>(),
        logits_tensor.data_ptr<float>() + logits_tensor.numel());

    // Sample the next token from the logits.
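    // (This example's BasicSampler is assumed to simply pick the
    // highest-scoring token, i.e. greedy decoding.)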
    int64_t next_token = sampler.sample(logits);

    // Break if we reached the end of the text.
    if (next_token == ENDOFTEXT_TOKEN) {
      break;
    }

    // Add the next token to the output.
    output_tokens.push_back(next_token);

    std::cout << tokenizer.decode({next_token});
    std::cout.flush();

    // Update next input.
    input_tokens.push_back(next_token);
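    // If the context window is full, evict the oldest token so the input
    // never exceeds the model's maximum sequence length.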
    if (input_tokens.size() > max_input_length) {
      input_tokens.erase(input_tokens.begin());
    }
  }

  std::cout << std::endl;

  // Convert the output tokens into a human-readable string.
  std::string output_string = tokenizer.decode(output_tokens);
  return output_string;
}

int main() {
  // Set up the prompt. This provides the seed text for the model to
  // elaborate on.
  std::cout << "Enter model prompt: ";
  std::string prompt;
  std::getline(std::cin, prompt);

  // The tokenizer is used to convert between tokens (used by the model) and
  // human-readable strings.
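  // vocab.json holds the GPT-2 string-to-token-id mapping; like the model
  // file below, it is loaded by relative path and must be present in the
  // working directory.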
  BasicTokenizer tokenizer("vocab.json");

  // The sampler is used to sample the next token from the logits.
  BasicSampler sampler = BasicSampler();

  // Load the exported nanoGPT program, which was generated via the previous
  // steps.
  Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
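  // This load mode memory-maps the .pte file and attempts to mlock it into
  // RAM, ignoring any mlock failure rather than treating it as an error.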

  const auto max_input_tokens = 1024;
  const auto max_output_tokens = 30;
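  // Echo the prompt first so the generated tokens stream out as a
  // continuation of it.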
  std::cout << prompt;
  generate(
      model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
}