// Copyright 2020 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

// This file provides the Detokenizer class, which is used to decode tokenized
// strings. To use a Detokenizer, load a binary format token database into
// memory, construct a TokenDatabase, and pass it to a Detokenizer:
//
//   std::vector data = ReadFile("my_tokenized_strings.db");
//   Detokenizer detok(TokenDatabase::Create(data));
//
//   DetokenizedString result = detok.Detokenize(my_data);
//   std::cout << result.BestString() << '\n';
//
#pragma once

#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>

#include "pw_result/result.h"
#include "pw_span/span.h"
#include "pw_stream/stream.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/token_database.h"

namespace pw::tokenizer {

/// @defgroup pw_tokenizer_detokenize
/// @{

/// Token database entry.
using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;

/// A string that has been detokenized. This class tracks all possible results
/// if there are token collisions.
50 class DetokenizedString { 51 public: 52 DetokenizedString(uint32_t token, 53 const span<const TokenizedStringEntry>& entries, 54 const span<const std::byte>& arguments); 55 DetokenizedString()56 DetokenizedString() : has_token_(false) {} 57 58 /// True if there was only one valid match and it decoded successfully. ok()59 bool ok() const { return matches_.size() == 1 && matches_[0].ok(); } 60 61 /// Returns the strings that matched the token, with the best matches first. matches()62 const std::vector<DecodedFormatString>& matches() const { return matches_; } 63 token()64 const uint32_t& token() const { return token_; } 65 66 /// Returns the detokenized string or an empty string if there were no 67 /// matches. If there are multiple possible results, the `DetokenizedString` 68 /// returns the first match. 69 std::string BestString() const; 70 71 /// Returns the best match, with error messages inserted for arguments that 72 /// failed to parse. 73 std::string BestStringWithErrors() const; 74 75 private: 76 uint32_t token_; 77 bool has_token_; 78 std::vector<DecodedFormatString> matches_; 79 }; 80 81 /// Decodes and detokenizes from a token database. This class builds a hash 82 /// table of tokens to give `O(1)` token lookups. 83 class Detokenizer { 84 public: 85 /// Constructs a detokenizer from a `TokenDatabase`. The `TokenDatabase` is 86 /// not referenced by the `Detokenizer` after construction; its memory can be 87 /// freed. 88 explicit Detokenizer(const TokenDatabase& database); 89 90 /// Constructs a detokenizer by directly passing the parsed database. Detokenizer(std::unordered_map<uint32_t,std::vector<TokenizedStringEntry>> && database)91 explicit Detokenizer( 92 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>>&& 93 database) 94 : database_(std::move(database)) {} 95 96 /// Constructs a detokenizer from the `.pw_tokenizer.entries` section of an 97 /// ELF binary. 
98 static Result<Detokenizer> FromElfSection(span<const std::byte> elf_section); 99 100 /// Overload of `FromElfSection` for a `uint8_t` span. FromElfSection(span<const uint8_t> elf_section)101 static Result<Detokenizer> FromElfSection(span<const uint8_t> elf_section) { 102 return FromElfSection(as_bytes(elf_section)); 103 } 104 105 /// Constructs a detokenizer from the `.pw_tokenizer.entries` section of an 106 /// ELF binary. 107 static Result<Detokenizer> FromElfFile(stream::SeekableReader& stream); 108 109 /// Decodes and detokenizes the binary encoded message. Returns a 110 /// `DetokenizedString` that stores all possible detokenized string results. 111 DetokenizedString Detokenize(const span<const std::byte>& encoded) const; 112 113 /// Overload of `Detokenize` for `span<const uint8_t>`. Detokenize(const span<const uint8_t> & encoded)114 DetokenizedString Detokenize(const span<const uint8_t>& encoded) const { 115 return Detokenize(as_bytes(encoded)); 116 } 117 118 /// Overload of `Detokenize` for `std::string_view`. Detokenize(std::string_view encoded)119 DetokenizedString Detokenize(std::string_view encoded) const { 120 return Detokenize(encoded.data(), encoded.size()); 121 } 122 123 /// Overload of `Detokenize` for a pointer and length. Detokenize(const void * encoded,size_t size_bytes)124 DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const { 125 return Detokenize(span(static_cast<const std::byte*>(encoded), size_bytes)); 126 } 127 128 /// Decodes and detokenizes a Base64-encoded message. Returns a 129 /// `DetokenizedString` that stores all possible detokenized string results. 130 DetokenizedString DetokenizeBase64Message(std::string_view text) const; 131 132 /// Decodes and detokenizes nested tokenized messages in a string. 133 /// 134 /// This function currently only supports Base64 nested tokenized messages. 135 /// Support for hexadecimal-encoded string literals will be added. 
136 /// 137 /// @param[in] text Text potentially containing tokenized messages. 138 /// 139 /// @param[in] max_passes `DetokenizeText` supports recursive detokenization. 140 /// Tokens can expand to other tokens. The maximum number of detokenization 141 /// passes is specified by `max_passes` (0 is equivalent to 1). 142 /// 143 /// @returns The original string with nested tokenized messages decoded in 144 /// context. Messages that fail to decode are left as-is. 145 std::string DetokenizeText(std::string_view text, 146 unsigned max_passes = 3) const; 147 148 /// Deprecated version of `DetokenizeText` with no recursive detokenization. 149 /// @deprecated Call `DetokenizeText` instead. DetokenizeBase64(std::string_view text)150 [[deprecated("Use DetokenizeText() instead")]] std::string DetokenizeBase64( 151 std::string_view text) const { 152 return DetokenizeText(text, 1); 153 } 154 155 /// Decodes data that may or may not be tokenized, such as proto fields marked 156 /// as optionally tokenized. 157 /// 158 /// This function currently only supports Base64 nested tokenized messages. 159 /// Support for hexadecimal-encoded string literals will be added. 160 /// 161 /// This function currently assumes when data is not tokenized it is printable 162 /// ASCII. Otherwise, the returned string will be base64-encoded. 163 /// 164 /// @param[in] optionally_tokenized_data Data optionally tokenized. 165 /// 166 /// @returns The decoded text if successfully detokenized or if the data is 167 /// printable, otherwise returns the data base64-encoded. 168 std::string DecodeOptionallyTokenizedData( 169 const span<const std::byte>& optionally_tokenized_data); 170 171 private: 172 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_; 173 }; 174 175 /// @} 176 177 } // namespace pw::tokenizer 178