1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under the BSD-style license found in the 6 * LICENSE file in the root directory of this source tree. 7 */ 8 9 #pragma once 10 11 #include <cinttypes> 12 #include <string> 13 #include <vector> 14 15 #include <executorch/runtime/core/error.h> 16 #include <executorch/runtime/core/result.h> 17 #include <executorch/runtime/platform/compiler.h> 18 19 namespace executorch { 20 namespace extension { 21 namespace llm { 22 23 // A tokenizer interface. 24 class ET_EXPERIMENTAL Tokenizer { 25 public: Tokenizer()26 explicit Tokenizer() {} ~Tokenizer()27 virtual ~Tokenizer() {} 28 29 virtual ::executorch::runtime::Error load( 30 const std::string& tokenizer_path) = 0; 31 32 virtual ::executorch::runtime::Result<std::vector<uint64_t>> 33 encode(const std::string& input, int8_t bos, int8_t eos) const = 0; 34 decode_verify(uint64_t token)35 ::executorch::runtime::Error decode_verify(uint64_t token) const { 36 if (!initialized_) { 37 ET_LOG(Error, "Tokenizer not initialized"); 38 return ::executorch::runtime::Error::NotSupported; 39 } 40 if (token >= vocab_size_) { 41 ET_LOG( 42 Error, 43 "token %" PRIu64 " is out side of vacab range %d", 44 token, 45 vocab_size_); 46 return ::executorch::runtime::Error::NotSupported; 47 } 48 return ::executorch::runtime::Error::Ok; 49 } 50 51 virtual ::executorch::runtime::Result<std::string> decode( 52 uint64_t prev_token, 53 uint64_t token) const = 0; 54 55 // getters vocab_size()56 int32_t vocab_size() const { 57 return vocab_size_; 58 } 59 bos_tok()60 uint64_t bos_tok() const { 61 return bos_tok_; 62 } 63 eos_tok()64 uint64_t eos_tok() const { 65 return eos_tok_; 66 } 67 68 protected: 69 bool initialized_ = false; 70 int32_t vocab_size_ = 0; 71 uint64_t bos_tok_ = 0; 72 uint64_t eos_tok_ = 0; 73 }; 74 75 } // namespace llm 76 } // namespace extension 77 } // namespace executorch 78 79 namespace torch { 80 namespace executor { 81 // TODO(T197294990): Remove these deprecated aliases once all users have moved 82 // to the new `::executorch` namespaces. 83 using ::executorch::extension::llm::Tokenizer; 84 } // namespace executor 85 } // namespace torch 86