1 // Copyright 2020 The Pigweed Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not 4 // use this file except in compliance with the License. You may obtain a copy of 5 // the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 12 // License for the specific language governing permissions and limitations under 13 // the License. 14 #pragma once 15 16 #include <array> 17 #include <cstddef> 18 #include <cstdint> 19 #include <iterator> 20 21 namespace pw::tokenizer { 22 23 /// Reads entries from a v0 binary token string database. This class does not 24 /// copy or modify the contents of the database. 25 /// 26 /// The v0 token database has two significant shortcomings: 27 /// 28 /// - Strings cannot contain null terminators (`\0`). If a string contains a 29 /// `\0`, the database will not work correctly. 30 /// - The domain is not included in entries. All tokens belong to a single 31 /// domain, which must be known independently. 32 /// 33 /// A v0 binary token database is comprised of a 16-byte header followed by an 34 /// array of 8-byte entries and a table of null-terminated strings. The header 35 /// specifies the number of entries. Each entry contains information about a 36 /// tokenized string: the token and removal date, if any. All fields are little- 37 /// endian. 38 /// 39 /// The token removal date is stored within an unsigned 32-bit integer. It is 40 /// stored as `<day> <month> <year>`, where `<day>` and `<month>` are 1 byte 41 /// each and `<year>` is two bytes. The fields are set to their maximum value 42 /// (`0xFF` or `0xFFFF`) if they are unset. With this format, dates may be 43 /// compared naturally as unsigned integers. 44 /// 45 /// @rst 46 /// ====== ==== ========================= 47 /// Header (16 bytes) 48 /// --------------------------------------- 49 /// Offset Size Field 50 /// ====== ==== ========================= 51 /// 0 6 Magic number (``TOKENS``) 52 /// 6 2 Version (``00 00``) 53 /// 8 4 Entry count 54 /// 12 4 Reserved 55 /// ====== ==== ========================= 56 /// 57 /// ====== ==== ================================== 58 /// Entry (8 bytes) 59 /// ------------------------------------------------ 60 /// Offset Size Field 61 /// ====== ==== ================================== 62 /// 0 4 Token 63 /// 4 1 Removal day (1-31, 255 if unset) 64 /// 5 1 Removal month (1-12, 255 if unset) 65 /// 6 2 Removal year (65535 if unset) 66 /// ====== ==== ================================== 67 /// @endrst 68 /// 69 /// Entries are sorted by token. A string table with a null-terminated string 70 /// for each entry in order follows the entries. 71 /// 72 /// Entries are accessed by iterating over the database. A O(n) `Find` function 73 /// is also provided. In typical use, a `TokenDatabase` is preprocessed by a 74 /// `pw::tokenizer::Detokenizer` into a `std::unordered_map`. 75 class TokenDatabase { 76 private: 77 // Internal struct that describes how the underlying binary token database 78 // stores entries. RawEntries generally should not be used directly. Instead, 79 // use an Entry, which contains a pointer to the entry's string. 80 struct RawEntry { 81 uint32_t token; 82 uint32_t date_removed; 83 }; 84 85 static_assert(sizeof(RawEntry) == 8u); 86 87 template <typename T> ReadUint32(const T * bytes)88 static constexpr uint32_t ReadUint32(const T* bytes) { 89 return static_cast<uint8_t>(bytes[0]) | 90 static_cast<uint8_t>(bytes[1]) << 8 | 91 static_cast<uint8_t>(bytes[2]) << 16 | 92 static_cast<uint8_t>(bytes[3]) << 24; 93 } 94 95 public: 96 /// Default date_removed for an entry in the token datase if it was never 97 /// removed. 98 static constexpr uint32_t kDateRemovedNever = 0xFFFFFFFF; 99 100 /// An entry in the token database. 101 struct Entry { 102 /// The token that represents this string. 103 uint32_t token; 104 105 /// The date the token and string was removed from the database, or 106 /// kDateRemovedNever if it was never removed. Dates are encoded such that 107 /// natural integer sorting sorts from oldest to newest dates. The day is 108 /// stored an an 8-bit day, 8-bit month, and 16-bit year, packed into a 109 /// little-endian `uint32_t`. 110 uint32_t date_removed; 111 112 /// The null-terminated string represented by this token. 113 const char* string; 114 }; 115 116 /// Iterator for `TokenDatabase` values. 117 class iterator { 118 public: 119 using difference_type = std::ptrdiff_t; 120 using value_type = Entry; 121 using pointer = const Entry*; 122 using reference = const Entry&; 123 using iterator_category = std::forward_iterator_tag; 124 iterator()125 constexpr iterator() : entry_{}, raw_(nullptr) {} 126 127 constexpr iterator(const iterator& other) = default; 128 constexpr iterator& operator=(const iterator& other) = default; 129 130 constexpr iterator& operator++() { 131 raw_ += sizeof(RawEntry); 132 ReadRawEntry(); 133 // Move string_ to the character beyond the next null terminator. 134 while (*entry_.string++ != '\0') { 135 } 136 return *this; 137 } 138 constexpr iterator operator++(int) { 139 iterator previous(*this); 140 operator++(); 141 return previous; 142 } 143 constexpr bool operator==(const iterator& rhs) const { 144 return raw_ == rhs.raw_; 145 } 146 constexpr bool operator!=(const iterator& rhs) const { 147 return raw_ != rhs.raw_; 148 } 149 150 constexpr const Entry& operator*() const { return entry_; } 151 152 constexpr const Entry* operator->() const { return &entry_; } 153 154 constexpr difference_type operator-(const iterator& rhs) const { 155 return (raw_ - rhs.raw_) / sizeof(RawEntry); 156 } 157 158 private: 159 friend class TokenDatabase; 160 161 // Constructs a new iterator to a valid entry. iterator(const char * raw_entry,const char * string)162 constexpr iterator(const char* raw_entry, const char* string) 163 : entry_{0, 0, string}, raw_{raw_entry} { 164 if (raw_entry != string) { // raw_entry == string if the DB is empty 165 ReadRawEntry(); 166 } 167 } 168 iterator(const char * end)169 explicit constexpr iterator(const char* end) : entry_{}, raw_(end) {} 170 ReadRawEntry()171 constexpr void ReadRawEntry() { 172 entry_.token = ReadUint32(raw_); 173 entry_.date_removed = ReadUint32(raw_ + sizeof(entry_.token)); 174 } 175 176 Entry entry_; 177 const char* raw_; 178 }; 179 180 using value_type = Entry; 181 using size_type = std::size_t; 182 using difference_type = std::ptrdiff_t; 183 using reference = value_type&; 184 using const_reference = const value_type&; 185 using pointer = const value_type*; 186 using const_pointer = const value_type*; 187 using const_iterator = iterator; 188 using reverse_iterator = std::reverse_iterator<iterator>; 189 using const_reverse_iterator = std::reverse_iterator<const_iterator>; 190 191 /// A list of token entries returned from a `Find` operation. This object can 192 /// be iterated over or indexed as an array. 193 class Entries { 194 public: Entries(const iterator & begin,const iterator & end)195 constexpr Entries(const iterator& begin, const iterator& end) 196 : begin_(begin), end_(end) {} 197 198 // The number of entries in this list. size()199 constexpr size_type size() const { return end_ - begin_; } 200 201 // True of the list is empty. empty()202 constexpr bool empty() const { return begin_ == end_; } 203 204 // Accesses the specified entry in this set. The index must be less than 205 // size(). This operation is O(n) in size(). 206 Entry operator[](size_type index) const; 207 begin()208 constexpr const iterator& begin() const { return begin_; } end()209 constexpr const iterator& end() const { return end_; } 210 211 private: 212 iterator begin_; 213 iterator end_; 214 }; 215 216 /// Returns true if the provided data is a valid token database. This checks 217 /// the magic number (`TOKENS`), version (which must be `0`), and that there 218 /// is is one string for each entry in the database. A database with extra 219 /// strings or other trailing data is considered valid. 220 template <typename ByteArray> IsValid(const ByteArray & bytes)221 static constexpr bool IsValid(const ByteArray& bytes) { 222 return HasValidHeader(bytes) && EachEntryHasAString(bytes); 223 } 224 225 /// Creates a `TokenDatabase` and checks if the provided data is valid at 226 /// compile time. Accepts references to constexpr containers (`array`, `span`, 227 /// `string_view`, etc.) with static storage duration. For example: 228 /// 229 /// @code{.cpp} 230 /// 231 /// constexpr char kMyData[] = ...; 232 /// constexpr TokenDatabase db = TokenDatabase::Create<kMyData>(); 233 /// 234 /// @endcode 235 template <const auto& kDatabaseBytes> Create()236 static constexpr TokenDatabase Create() { 237 static_assert( 238 HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes), 239 "Databases must start with a 16-byte header that begins with TOKENS."); 240 241 static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes), 242 "The database must have at least one string for each entry."); 243 244 return TokenDatabase(std::data(kDatabaseBytes)); 245 } 246 247 /// Creates a `TokenDatabase` from the provided byte array. The array may be a 248 /// span, array, or other container type. If the data is not valid, returns a 249 /// default-constructed database for which ok() is false. 250 /// 251 /// Prefer the `Create` overload that takes the data as a template parameter 252 /// when possible, since that overload verifies data integrity at compile 253 /// time. 254 template <typename ByteArray> Create(const ByteArray & database_bytes)255 static constexpr TokenDatabase Create(const ByteArray& database_bytes) { 256 return IsValid<ByteArray>(database_bytes) 257 ? TokenDatabase(std::data(database_bytes)) 258 : TokenDatabase(); // Invalid database. 259 } 260 /// Creates a database with no data. `ok()` returns false. TokenDatabase()261 constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {} 262 263 /// Returns all entries associated with this token. This is `O(n)`. 264 Entries Find(uint32_t token) const; 265 266 /// Returns the total number of entries (unique token-string pairs). size()267 constexpr size_type size() const { 268 return (end_.data - begin_.data) / sizeof(RawEntry); 269 } 270 271 /// True if this database was constructed with valid data. The database might 272 /// be empty, but it has an intact header and a string for each entry. ok()273 constexpr bool ok() const { return begin_.data != nullptr; } 274 275 /// Returns an iterator for the first token entry. begin()276 constexpr iterator begin() const { return iterator(begin_.data, end_.data); } 277 278 /// Returns an iterator for one past the last token entry. end()279 constexpr iterator end() const { return iterator(end_.data); } 280 281 private: 282 struct Header { 283 std::array<char, 6> magic; 284 uint16_t version; 285 uint32_t entry_count; 286 uint32_t reserved; 287 }; 288 289 static_assert(sizeof(Header) == 2 * sizeof(RawEntry)); 290 291 template <typename ByteArray> HasValidHeader(const ByteArray & bytes)292 static constexpr bool HasValidHeader(const ByteArray& bytes) { 293 static_assert(sizeof(*std::data(bytes)) == 1u); 294 295 if (std::size(bytes) < sizeof(Header)) { 296 return false; 297 } 298 299 // Check the magic number and version. 300 for (size_type i = 0; i < kMagicAndVersion.size(); ++i) { 301 if (bytes[i] != kMagicAndVersion[i]) { 302 return false; 303 } 304 } 305 306 return true; 307 } 308 309 template <typename ByteArray> EachEntryHasAString(const ByteArray & bytes)310 static constexpr bool EachEntryHasAString(const ByteArray& bytes) { 311 const size_type entries = ReadEntryCount(std::data(bytes)); 312 313 // Check that the data is large enough to have a string table. 314 if (std::size(bytes) < StringTable(entries)) { 315 return false; 316 } 317 318 // Count the strings in the string table. 319 size_type string_count = 0; 320 for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes); 321 ++i) { 322 string_count += (*i == '\0') ? 1 : 0; 323 } 324 325 // Check that there is at least one string for each entry. 326 return string_count >= entries; 327 } 328 329 // Reads the number of entries from a database header. Cast to the bytes to 330 // uint8_t to avoid sign extension if T is signed. 331 template <typename T> ReadEntryCount(const T * header_bytes)332 static constexpr uint32_t ReadEntryCount(const T* header_bytes) { 333 const T* bytes = header_bytes + offsetof(Header, entry_count); 334 return ReadUint32(bytes); 335 } 336 337 // Calculates the offset of the string table. StringTable(size_type entries)338 static constexpr size_type StringTable(size_type entries) { 339 return sizeof(Header) + entries * sizeof(RawEntry); 340 } 341 342 // The magic number that starts the table is "TOKENS". The version is encoded 343 // next as two bytes. 344 static constexpr std::array<char, 8> kMagicAndVersion = { 345 'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'}; 346 347 template <typename Byte> TokenDatabase(const Byte bytes[])348 constexpr TokenDatabase(const Byte bytes[]) 349 : TokenDatabase(bytes + sizeof(Header), 350 bytes + StringTable(ReadEntryCount(bytes))) { 351 static_assert(sizeof(Byte) == 1u); 352 } 353 354 // It is illegal to reinterpret_cast in constexpr functions, but acceptable to 355 // use unions. Instead of using a reinterpret_cast to change the byte pointer 356 // to a RawEntry pointer, have a separate overload for each byte pointer type 357 // and store them in a union. TokenDatabase(const char * begin,const char * end)358 constexpr TokenDatabase(const char* begin, const char* end) 359 : begin_{.data = begin}, end_{.data = end} {} 360 TokenDatabase(const unsigned char * begin,const unsigned char * end)361 constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end) 362 : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {} 363 TokenDatabase(const signed char * begin,const signed char * end)364 constexpr TokenDatabase(const signed char* begin, const signed char* end) 365 : begin_{.signed_data = begin}, end_{.signed_data = end} {} 366 367 // Store the beginning and end pointers as a union to avoid breaking constexpr 368 // rules for reinterpret_cast. 369 union { 370 const char* data; 371 const unsigned char* unsigned_data; 372 const signed char* signed_data; 373 } begin_, end_; 374 }; 375 376 } // namespace pw::tokenizer 377