xref: /aosp_15_r20/external/executorch/extension/llm/tokenizer/tokenizer.h (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #pragma once
10 
11 #include <cinttypes>
12 #include <string>
13 #include <vector>
14 
15 #include <executorch/runtime/core/error.h>
16 #include <executorch/runtime/core/result.h>
17 #include <executorch/runtime/platform/compiler.h>
18 
19 namespace executorch {
20 namespace extension {
21 namespace llm {
22 
23 // A tokenizer interface.
24 class ET_EXPERIMENTAL Tokenizer {
25  public:
Tokenizer()26   explicit Tokenizer() {}
~Tokenizer()27   virtual ~Tokenizer() {}
28 
29   virtual ::executorch::runtime::Error load(
30       const std::string& tokenizer_path) = 0;
31 
32   virtual ::executorch::runtime::Result<std::vector<uint64_t>>
33   encode(const std::string& input, int8_t bos, int8_t eos) const = 0;
34 
decode_verify(uint64_t token)35   ::executorch::runtime::Error decode_verify(uint64_t token) const {
36     if (!initialized_) {
37       ET_LOG(Error, "Tokenizer not initialized");
38       return ::executorch::runtime::Error::NotSupported;
39     }
40     if (token >= vocab_size_) {
41       ET_LOG(
42           Error,
43           "token  %" PRIu64 " is out side of vacab range %d",
44           token,
45           vocab_size_);
46       return ::executorch::runtime::Error::NotSupported;
47     }
48     return ::executorch::runtime::Error::Ok;
49   }
50 
51   virtual ::executorch::runtime::Result<std::string> decode(
52       uint64_t prev_token,
53       uint64_t token) const = 0;
54 
55   // getters
vocab_size()56   int32_t vocab_size() const {
57     return vocab_size_;
58   }
59 
bos_tok()60   uint64_t bos_tok() const {
61     return bos_tok_;
62   }
63 
eos_tok()64   uint64_t eos_tok() const {
65     return eos_tok_;
66   }
67 
68  protected:
69   bool initialized_ = false;
70   int32_t vocab_size_ = 0;
71   uint64_t bos_tok_ = 0;
72   uint64_t eos_tok_ = 0;
73 };
74 
75 } // namespace llm
76 } // namespace extension
77 } // namespace executorch
78 
79 namespace torch {
80 namespace executor {
81 // TODO(T197294990): Remove these deprecated aliases once all users have moved
82 // to the new `::executorch` namespaces.
83 using ::executorch::extension::llm::Tokenizer;
84 } // namespace executor
85 } // namespace torch
86