xref: /aosp_15_r20/external/pigweed/pw_tokenizer/public/pw_tokenizer/token_database.h (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 #pragma once
15 
16 #include <array>
17 #include <cstddef>
18 #include <cstdint>
19 #include <iterator>
20 
21 namespace pw::tokenizer {
22 
23 /// Reads entries from a v0 binary token string database. This class does not
24 /// copy or modify the contents of the database.
25 ///
26 /// The v0 token database has two significant shortcomings:
27 ///
28 ///   - Strings cannot contain null terminators (`\0`). If a string contains a
29 ///     `\0`, the database will not work correctly.
30 ///   - The domain is not included in entries. All tokens belong to a single
31 ///     domain, which must be known independently.
32 ///
33 /// A v0 binary token database is comprised of a 16-byte header followed by an
34 /// array of 8-byte entries and a table of null-terminated strings. The header
35 /// specifies the number of entries. Each entry contains information about a
36 /// tokenized string: the token and removal date, if any. All fields are little-
37 /// endian.
38 ///
39 /// The token removal date is stored within an unsigned 32-bit integer. It is
40 /// stored as `<day> <month> <year>`, where `<day>` and `<month>` are 1 byte
41 /// each and `<year>` is two bytes. The fields are set to their maximum value
42 /// (`0xFF` or `0xFFFF`) if they are unset. With this format, dates may be
43 /// compared naturally as unsigned integers.
44 ///
45 /// @rst
46 ///   ======  ====  =========================
47 ///   Header (16 bytes)
48 ///   ---------------------------------------
49 ///   Offset  Size  Field
50 ///   ======  ====  =========================
51 ///        0     6  Magic number (``TOKENS``)
52 ///        6     2  Version (``00 00``)
53 ///        8     4  Entry count
54 ///       12     4  Reserved
55 ///   ======  ====  =========================
56 ///
57 ///   ======  ====  ==================================
58 ///   Entry (8 bytes)
59 ///   ------------------------------------------------
60 ///   Offset  Size  Field
61 ///   ======  ====  ==================================
62 ///        0     4  Token
63 ///        4     1  Removal day (1-31, 255 if unset)
64 ///        5     1  Removal month (1-12, 255 if unset)
65 ///        6     2  Removal year (65535 if unset)
66 ///   ======  ====  ==================================
67 /// @endrst
68 ///
69 /// Entries are sorted by token. A string table with a null-terminated string
70 /// for each entry in order follows the entries.
71 ///
72 /// Entries are accessed by iterating over the database. A O(n) `Find` function
73 /// is also provided. In typical use, a `TokenDatabase` is preprocessed by a
74 /// `pw::tokenizer::Detokenizer` into a `std::unordered_map`.
75 class TokenDatabase {
76  private:
77   // Internal struct that describes how the underlying binary token database
78   // stores entries. RawEntries generally should not be used directly. Instead,
79   // use an Entry, which contains a pointer to the entry's string.
80   struct RawEntry {
81     uint32_t token;
82     uint32_t date_removed;
83   };
84 
85   static_assert(sizeof(RawEntry) == 8u);
86 
87   template <typename T>
ReadUint32(const T * bytes)88   static constexpr uint32_t ReadUint32(const T* bytes) {
89     return static_cast<uint8_t>(bytes[0]) |
90            static_cast<uint8_t>(bytes[1]) << 8 |
91            static_cast<uint8_t>(bytes[2]) << 16 |
92            static_cast<uint8_t>(bytes[3]) << 24;
93   }
94 
95  public:
96   /// Default date_removed for an entry in the token datase if it was never
97   /// removed.
98   static constexpr uint32_t kDateRemovedNever = 0xFFFFFFFF;
99 
100   /// An entry in the token database.
101   struct Entry {
102     /// The token that represents this string.
103     uint32_t token;
104 
105     /// The date the token and string was removed from the database, or
106     /// kDateRemovedNever if it was never removed. Dates are encoded such that
107     /// natural integer sorting sorts from oldest to newest dates. The day is
108     /// stored an an 8-bit day, 8-bit month, and 16-bit year, packed into a
109     /// little-endian `uint32_t`.
110     uint32_t date_removed;
111 
112     /// The null-terminated string represented by this token.
113     const char* string;
114   };
115 
116   /// Iterator for `TokenDatabase` values.
117   class iterator {
118    public:
119     using difference_type = std::ptrdiff_t;
120     using value_type = Entry;
121     using pointer = const Entry*;
122     using reference = const Entry&;
123     using iterator_category = std::forward_iterator_tag;
124 
iterator()125     constexpr iterator() : entry_{}, raw_(nullptr) {}
126 
127     constexpr iterator(const iterator& other) = default;
128     constexpr iterator& operator=(const iterator& other) = default;
129 
130     constexpr iterator& operator++() {
131       raw_ += sizeof(RawEntry);
132       ReadRawEntry();
133       // Move string_ to the character beyond the next null terminator.
134       while (*entry_.string++ != '\0') {
135       }
136       return *this;
137     }
138     constexpr iterator operator++(int) {
139       iterator previous(*this);
140       operator++();
141       return previous;
142     }
143     constexpr bool operator==(const iterator& rhs) const {
144       return raw_ == rhs.raw_;
145     }
146     constexpr bool operator!=(const iterator& rhs) const {
147       return raw_ != rhs.raw_;
148     }
149 
150     constexpr const Entry& operator*() const { return entry_; }
151 
152     constexpr const Entry* operator->() const { return &entry_; }
153 
154     constexpr difference_type operator-(const iterator& rhs) const {
155       return (raw_ - rhs.raw_) / sizeof(RawEntry);
156     }
157 
158    private:
159     friend class TokenDatabase;
160 
161     // Constructs a new iterator to a valid entry.
iterator(const char * raw_entry,const char * string)162     constexpr iterator(const char* raw_entry, const char* string)
163         : entry_{0, 0, string}, raw_{raw_entry} {
164       if (raw_entry != string) {  // raw_entry == string if the DB is empty
165         ReadRawEntry();
166       }
167     }
168 
iterator(const char * end)169     explicit constexpr iterator(const char* end) : entry_{}, raw_(end) {}
170 
ReadRawEntry()171     constexpr void ReadRawEntry() {
172       entry_.token = ReadUint32(raw_);
173       entry_.date_removed = ReadUint32(raw_ + sizeof(entry_.token));
174     }
175 
176     Entry entry_;
177     const char* raw_;
178   };
179 
180   using value_type = Entry;
181   using size_type = std::size_t;
182   using difference_type = std::ptrdiff_t;
183   using reference = value_type&;
184   using const_reference = const value_type&;
185   using pointer = const value_type*;
186   using const_pointer = const value_type*;
187   using const_iterator = iterator;
188   using reverse_iterator = std::reverse_iterator<iterator>;
189   using const_reverse_iterator = std::reverse_iterator<const_iterator>;
190 
191   /// A list of token entries returned from a `Find` operation. This object can
192   /// be iterated over or indexed as an array.
193   class Entries {
194    public:
Entries(const iterator & begin,const iterator & end)195     constexpr Entries(const iterator& begin, const iterator& end)
196         : begin_(begin), end_(end) {}
197 
198     // The number of entries in this list.
size()199     constexpr size_type size() const { return end_ - begin_; }
200 
201     // True of the list is empty.
empty()202     constexpr bool empty() const { return begin_ == end_; }
203 
204     // Accesses the specified entry in this set. The index must be less than
205     // size(). This operation is O(n) in size().
206     Entry operator[](size_type index) const;
207 
begin()208     constexpr const iterator& begin() const { return begin_; }
end()209     constexpr const iterator& end() const { return end_; }
210 
211    private:
212     iterator begin_;
213     iterator end_;
214   };
215 
216   /// Returns true if the provided data is a valid token database. This checks
217   /// the magic number (`TOKENS`), version (which must be `0`), and that there
218   /// is is one string for each entry in the database. A database with extra
219   /// strings or other trailing data is considered valid.
220   template <typename ByteArray>
IsValid(const ByteArray & bytes)221   static constexpr bool IsValid(const ByteArray& bytes) {
222     return HasValidHeader(bytes) && EachEntryHasAString(bytes);
223   }
224 
225   /// Creates a `TokenDatabase` and checks if the provided data is valid at
226   /// compile time. Accepts references to constexpr containers (`array`, `span`,
227   /// `string_view`, etc.) with static storage duration. For example:
228   ///
229   ///  @code{.cpp}
230   ///
231   ///    constexpr char kMyData[] = ...;
232   ///    constexpr TokenDatabase db = TokenDatabase::Create<kMyData>();
233   ///
234   ///  @endcode
235   template <const auto& kDatabaseBytes>
Create()236   static constexpr TokenDatabase Create() {
237     static_assert(
238         HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes),
239         "Databases must start with a 16-byte header that begins with TOKENS.");
240 
241     static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes),
242                   "The database must have at least one string for each entry.");
243 
244     return TokenDatabase(std::data(kDatabaseBytes));
245   }
246 
247   /// Creates a `TokenDatabase` from the provided byte array. The array may be a
248   /// span, array, or other container type. If the data is not valid, returns a
249   /// default-constructed database for which ok() is false.
250   ///
251   /// Prefer the `Create` overload that takes the data as a template parameter
252   /// when possible, since that overload verifies data integrity at compile
253   /// time.
254   template <typename ByteArray>
Create(const ByteArray & database_bytes)255   static constexpr TokenDatabase Create(const ByteArray& database_bytes) {
256     return IsValid<ByteArray>(database_bytes)
257                ? TokenDatabase(std::data(database_bytes))
258                : TokenDatabase();  // Invalid database.
259   }
260   /// Creates a database with no data. `ok()` returns false.
TokenDatabase()261   constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {}
262 
263   /// Returns all entries associated with this token. This is `O(n)`.
264   Entries Find(uint32_t token) const;
265 
266   /// Returns the total number of entries (unique token-string pairs).
size()267   constexpr size_type size() const {
268     return (end_.data - begin_.data) / sizeof(RawEntry);
269   }
270 
271   /// True if this database was constructed with valid data. The database might
272   /// be empty, but it has an intact header and a string for each entry.
ok()273   constexpr bool ok() const { return begin_.data != nullptr; }
274 
275   /// Returns an iterator for the first token entry.
begin()276   constexpr iterator begin() const { return iterator(begin_.data, end_.data); }
277 
278   /// Returns an iterator for one past the last token entry.
end()279   constexpr iterator end() const { return iterator(end_.data); }
280 
281  private:
282   struct Header {
283     std::array<char, 6> magic;
284     uint16_t version;
285     uint32_t entry_count;
286     uint32_t reserved;
287   };
288 
289   static_assert(sizeof(Header) == 2 * sizeof(RawEntry));
290 
291   template <typename ByteArray>
HasValidHeader(const ByteArray & bytes)292   static constexpr bool HasValidHeader(const ByteArray& bytes) {
293     static_assert(sizeof(*std::data(bytes)) == 1u);
294 
295     if (std::size(bytes) < sizeof(Header)) {
296       return false;
297     }
298 
299     // Check the magic number and version.
300     for (size_type i = 0; i < kMagicAndVersion.size(); ++i) {
301       if (bytes[i] != kMagicAndVersion[i]) {
302         return false;
303       }
304     }
305 
306     return true;
307   }
308 
309   template <typename ByteArray>
EachEntryHasAString(const ByteArray & bytes)310   static constexpr bool EachEntryHasAString(const ByteArray& bytes) {
311     const size_type entries = ReadEntryCount(std::data(bytes));
312 
313     // Check that the data is large enough to have a string table.
314     if (std::size(bytes) < StringTable(entries)) {
315       return false;
316     }
317 
318     // Count the strings in the string table.
319     size_type string_count = 0;
320     for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes);
321          ++i) {
322       string_count += (*i == '\0') ? 1 : 0;
323     }
324 
325     // Check that there is at least one string for each entry.
326     return string_count >= entries;
327   }
328 
329   // Reads the number of entries from a database header. Cast to the bytes to
330   // uint8_t to avoid sign extension if T is signed.
331   template <typename T>
ReadEntryCount(const T * header_bytes)332   static constexpr uint32_t ReadEntryCount(const T* header_bytes) {
333     const T* bytes = header_bytes + offsetof(Header, entry_count);
334     return ReadUint32(bytes);
335   }
336 
337   // Calculates the offset of the string table.
StringTable(size_type entries)338   static constexpr size_type StringTable(size_type entries) {
339     return sizeof(Header) + entries * sizeof(RawEntry);
340   }
341 
342   // The magic number that starts the table is "TOKENS". The version is encoded
343   // next as two bytes.
344   static constexpr std::array<char, 8> kMagicAndVersion = {
345       'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'};
346 
347   template <typename Byte>
TokenDatabase(const Byte bytes[])348   constexpr TokenDatabase(const Byte bytes[])
349       : TokenDatabase(bytes + sizeof(Header),
350                       bytes + StringTable(ReadEntryCount(bytes))) {
351     static_assert(sizeof(Byte) == 1u);
352   }
353 
354   // It is illegal to reinterpret_cast in constexpr functions, but acceptable to
355   // use unions. Instead of using a reinterpret_cast to change the byte pointer
356   // to a RawEntry pointer, have a separate overload for each byte pointer type
357   // and store them in a union.
TokenDatabase(const char * begin,const char * end)358   constexpr TokenDatabase(const char* begin, const char* end)
359       : begin_{.data = begin}, end_{.data = end} {}
360 
TokenDatabase(const unsigned char * begin,const unsigned char * end)361   constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end)
362       : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {}
363 
TokenDatabase(const signed char * begin,const signed char * end)364   constexpr TokenDatabase(const signed char* begin, const signed char* end)
365       : begin_{.signed_data = begin}, end_{.signed_data = end} {}
366 
367   // Store the beginning and end pointers as a union to avoid breaking constexpr
368   // rules for reinterpret_cast.
369   union {
370     const char* data;
371     const unsigned char* unsigned_data;
372     const signed char* signed_data;
373   } begin_, end_;
374 };
375 
376 }  // namespace pw::tokenizer
377