xref: /aosp_15_r20/external/pigweed/pw_tokenizer/detokenize.cc (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 #include "pw_tokenizer/detokenize.h"
16 
17 #include <algorithm>
18 #include <cctype>
19 #include <cstring>
20 #include <string_view>
21 #include <vector>
22 
23 #include "pw_bytes/bit.h"
24 #include "pw_bytes/endian.h"
25 #include "pw_elf/reader.h"
26 #include "pw_result/result.h"
27 #include "pw_status/try.h"
28 #include "pw_tokenizer/base64.h"
29 #include "pw_tokenizer/internal/decode.h"
30 #include "pw_tokenizer/nested_tokenization.h"
31 
32 namespace pw::tokenizer {
33 namespace {
34 
35 class NestedMessageDetokenizer {
36  public:
NestedMessageDetokenizer(const Detokenizer & detokenizer)37   NestedMessageDetokenizer(const Detokenizer& detokenizer)
38       : detokenizer_(detokenizer) {}
39 
Detokenize(std::string_view chunk)40   void Detokenize(std::string_view chunk) {
41     for (char next_char : chunk) {
42       Detokenize(next_char);
43     }
44   }
45 
OutputChangedSinceLastCheck()46   bool OutputChangedSinceLastCheck() {
47     const bool changed = output_changed_;
48     output_changed_ = false;
49     return changed;
50   }
51 
Detokenize(char next_char)52   void Detokenize(char next_char) {
53     switch (state_) {
54       case kNonMessage:
55         if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
56           message_buffer_.push_back(next_char);
57           state_ = kMessage;
58         } else {
59           output_.push_back(next_char);
60         }
61         break;
62       case kMessage:
63         if (base64::IsValidChar(next_char)) {
64           message_buffer_.push_back(next_char);
65         } else {
66           HandleEndOfMessage();
67           if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
68             message_buffer_.push_back(next_char);
69           } else {
70             output_.push_back(next_char);
71             state_ = kNonMessage;
72           }
73         }
74         break;
75     }
76   }
77 
Flush()78   std::string Flush() {
79     if (state_ == kMessage) {
80       HandleEndOfMessage();
81       state_ = kNonMessage;
82     }
83     std::string output(std::move(output_));
84     output_.clear();
85     return output;
86   }
87 
88  private:
HandleEndOfMessage()89   void HandleEndOfMessage() {
90     if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
91         result.ok()) {
92       output_ += result.BestString();
93       output_changed_ = true;
94     } else {
95       output_ += message_buffer_;  // Keep the original if it doesn't decode.
96     }
97     message_buffer_.clear();
98   }
99 
100   const Detokenizer& detokenizer_;
101   std::string output_;
102   std::string message_buffer_;
103 
104   enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage;
105   bool output_changed_ = false;
106 };
107 
UnknownTokenMessage(uint32_t value)108 std::string UnknownTokenMessage(uint32_t value) {
109   std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
110 
111   // Output a hexadecimal version of the token.
112   for (int shift = 28; shift >= 0; shift -= 4) {
113     output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
114   }
115 
116   output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
117   return output;
118 }
119 
120 // Decoding result with the date removed, for sorting.
121 using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
122 
123 // Determines if one result is better than the other if collisions occurred.
124 // Returns true if lhs is preferred over rhs. This logic should match the
125 // collision resolution logic in detokenize.py.
IsBetterResult(const DecodingResult & lhs,const DecodingResult & rhs)126 bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
127   // Favor the result for which decoding succeeded.
128   if (lhs.first.ok() != rhs.first.ok()) {
129     return lhs.first.ok();
130   }
131 
132   // Favor the result for which all bytes were decoded.
133   if ((lhs.first.remaining_bytes() == 0u) !=
134       (rhs.first.remaining_bytes() == 0u)) {
135     return lhs.first.remaining_bytes() == 0u;
136   }
137 
138   // Favor the result with fewer decoding errors.
139   if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
140     return lhs.first.decoding_errors() < rhs.first.decoding_errors();
141   }
142 
143   // Favor the result that successfully decoded the most arguments.
144   if (lhs.first.argument_count() != rhs.first.argument_count()) {
145     return lhs.first.argument_count() > rhs.first.argument_count();
146   }
147 
148   // Favor the result that was removed from the database most recently.
149   return lhs.second > rhs.second;
150 }
151 
152 // Returns true if all characters in data are printable, space, or if the string
153 // is empty.
// Returns true if all characters in data are printable, space, or if the
// string is empty.
//
// Fix: the <cctype> classification functions have undefined behavior when
// passed a value that is not representable as unsigned char (and is not EOF).
// The previous code fed raw (possibly negative) char values straight to
// std::isprint/std::isspace, which is UB for any byte >= 0x80 on platforms
// where char is signed. Convert each byte through unsigned char first.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (char letter : data) {
    const auto letter_value = static_cast<unsigned char>(letter);
    if (std::isprint(letter_value) == 0 && std::isspace(letter_value) == 0) {
      return false;
    }
  }
  return true;
}
167 
168 }  // namespace
169 
DetokenizedString(uint32_t token,const span<const TokenizedStringEntry> & entries,const span<const std::byte> & arguments)170 DetokenizedString::DetokenizedString(
171     uint32_t token,
172     const span<const TokenizedStringEntry>& entries,
173     const span<const std::byte>& arguments)
174     : token_(token), has_token_(true) {
175   std::vector<DecodingResult> results;
176 
177   for (const auto& [format, date_removed] : entries) {
178     results.push_back(DecodingResult{
179         format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()),
180                            arguments.size())),
181         date_removed});
182   }
183 
184   std::sort(results.begin(), results.end(), IsBetterResult);
185 
186   for (auto& result : results) {
187     matches_.push_back(std::move(result.first));
188   }
189 }
190 
BestString() const191 std::string DetokenizedString::BestString() const {
192   return matches_.empty() ? std::string() : matches_[0].value();
193 }
194 
BestStringWithErrors() const195 std::string DetokenizedString::BestStringWithErrors() const {
196   if (matches_.empty()) {
197     return has_token_ ? UnknownTokenMessage(token_)
198                       : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
199   }
200   return matches_[0].value_with_errors();
201 }
202 
Detokenizer(const TokenDatabase & database)203 Detokenizer::Detokenizer(const TokenDatabase& database) {
204   for (const auto& entry : database) {
205     database_[entry.token].emplace_back(entry.string, entry.date_removed);
206   }
207 }
208 
FromElfSection(span<const std::byte> elf_section)209 Result<Detokenizer> Detokenizer::FromElfSection(
210     span<const std::byte> elf_section) {
211   size_t index = 0;
212   std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;
213 
214   while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
215     _pw_tokenizer_EntryHeader header;
216     std::memcpy(
217         &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
218     index += sizeof(_pw_tokenizer_EntryHeader);
219 
220     if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
221       return Status::DataLoss();
222     }
223 
224     index += header.domain_length;
225     if (index + header.string_length <= elf_section.size()) {
226       // TODO(b/326365218): Construct FormatString with string_view to avoid
227       // creating a copy here.
228       std::string entry(
229           reinterpret_cast<const char*>(elf_section.data() + index),
230           header.string_length);
231       index += header.string_length;
232       database[header.token].emplace_back(entry.c_str(),
233                                           TokenDatabase::kDateRemovedNever);
234     }
235   }
236   return Detokenizer(std::move(database));
237 }
238 
FromElfFile(stream::SeekableReader & stream)239 Result<Detokenizer> Detokenizer::FromElfFile(stream::SeekableReader& stream) {
240   PW_TRY_ASSIGN(auto reader, pw::elf::ElfReader::FromStream(stream));
241 
242   constexpr auto kTokenSectionName = ".pw_tokenizer.entries";
243   PW_TRY_ASSIGN(std::vector<std::byte> section_data,
244                 reader.ReadSection(kTokenSectionName));
245 
246   return Detokenizer::FromElfSection(section_data);
247 }
248 
Detokenize(const span<const std::byte> & encoded) const249 DetokenizedString Detokenizer::Detokenize(
250     const span<const std::byte>& encoded) const {
251   // The token is missing from the encoded data; there is nothing to do.
252   if (encoded.empty()) {
253     return DetokenizedString();
254   }
255 
256   uint32_t token = bytes::ReadInOrder<uint32_t>(
257       endian::little, encoded.data(), encoded.size());
258 
259   const auto result = database_.find(token);
260 
261   return DetokenizedString(
262       token,
263       result == database_.end() ? span<TokenizedStringEntry>()
264                                 : span(result->second),
265       encoded.size() < sizeof(token) ? span<const std::byte>()
266                                      : encoded.subspan(sizeof(token)));
267 }
268 
DetokenizeBase64Message(std::string_view text) const269 DetokenizedString Detokenizer::DetokenizeBase64Message(
270     std::string_view text) const {
271   std::string buffer(text);
272   buffer.resize(PrefixedBase64DecodeInPlace(buffer));
273   return Detokenize(buffer);
274 }
275 
DetokenizeText(std::string_view text,const unsigned max_passes) const276 std::string Detokenizer::DetokenizeText(std::string_view text,
277                                         const unsigned max_passes) const {
278   NestedMessageDetokenizer detokenizer(*this);
279   detokenizer.Detokenize(text);
280 
281   std::string result;
282   unsigned pass = 1;
283 
284   while (true) {
285     result = detokenizer.Flush();
286     if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) {
287       break;
288     }
289     detokenizer.Detokenize(result);
290     pass += 1;
291   }
292   return result;
293 }
294 
DecodeOptionallyTokenizedData(const ConstByteSpan & optionally_tokenized_data)295 std::string Detokenizer::DecodeOptionallyTokenizedData(
296     const ConstByteSpan& optionally_tokenized_data) {
297   // Try detokenizing as binary using the best result if available, else use
298   // the input data as a string.
299   const auto result = Detokenize(optionally_tokenized_data);
300   const bool found_matches = !result.matches().empty();
301   // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
302   // process does not encode and decode UTF8 format, it is sufficient to check
303   // if the data is printable ASCII.
304   const std::string data =
305       found_matches
306           ? result.BestString()
307           : std::string(
308                 reinterpret_cast<const char*>(optionally_tokenized_data.data()),
309                 optionally_tokenized_data.size());
310 
311   const bool is_data_printable = IsPrintableAscii(data);
312   if (!found_matches && !is_data_printable) {
313     // Assume the token is unknown or the data is corrupt.
314     std::vector<char> base64_encoding_buffer(
315         Base64EncodedBufferSize(optionally_tokenized_data.size()));
316     const size_t encoded_length = PrefixedBase64Encode(
317         optionally_tokenized_data, span(base64_encoding_buffer));
318     return std::string{base64_encoding_buffer.data(), encoded_length};
319   }
320 
321   // Successfully detokenized, check if the field has more prefixed
322   // base64-encoded tokens.
323   const std::string field = DetokenizeText(data);
324   // If anything detokenized successfully, use that.
325   if (field != data) {
326     return field;
327   }
328 
329   // Attempt to determine whether this is an unknown token or plain text.
330   // Any string with only printable or whitespace characters is plain text.
331   if (found_matches || is_data_printable) {
332     return data;
333   }
334 
335   // Assume this field is tokenized data that could not be decoded.
336   std::vector<char> base64_encoding_buffer(
337       Base64EncodedBufferSize(optionally_tokenized_data.size()));
338   const size_t encoded_length = PrefixedBase64Encode(
339       optionally_tokenized_data, span(base64_encoding_buffer));
340   return std::string{base64_encoding_buffer.data(), encoded_length};
341 }
342 
343 }  // namespace pw::tokenizer
344