1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14
15 #include "pw_tokenizer/detokenize.h"
16
17 #include <algorithm>
18 #include <cctype>
19 #include <cstring>
20 #include <string_view>
21 #include <vector>
22
23 #include "pw_bytes/bit.h"
24 #include "pw_bytes/endian.h"
25 #include "pw_elf/reader.h"
26 #include "pw_result/result.h"
27 #include "pw_status/try.h"
28 #include "pw_tokenizer/base64.h"
29 #include "pw_tokenizer/internal/decode.h"
30 #include "pw_tokenizer/nested_tokenization.h"
31
32 namespace pw::tokenizer {
33 namespace {
34
35 class NestedMessageDetokenizer {
36 public:
NestedMessageDetokenizer(const Detokenizer & detokenizer)37 NestedMessageDetokenizer(const Detokenizer& detokenizer)
38 : detokenizer_(detokenizer) {}
39
Detokenize(std::string_view chunk)40 void Detokenize(std::string_view chunk) {
41 for (char next_char : chunk) {
42 Detokenize(next_char);
43 }
44 }
45
OutputChangedSinceLastCheck()46 bool OutputChangedSinceLastCheck() {
47 const bool changed = output_changed_;
48 output_changed_ = false;
49 return changed;
50 }
51
Detokenize(char next_char)52 void Detokenize(char next_char) {
53 switch (state_) {
54 case kNonMessage:
55 if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
56 message_buffer_.push_back(next_char);
57 state_ = kMessage;
58 } else {
59 output_.push_back(next_char);
60 }
61 break;
62 case kMessage:
63 if (base64::IsValidChar(next_char)) {
64 message_buffer_.push_back(next_char);
65 } else {
66 HandleEndOfMessage();
67 if (next_char == PW_TOKENIZER_NESTED_PREFIX) {
68 message_buffer_.push_back(next_char);
69 } else {
70 output_.push_back(next_char);
71 state_ = kNonMessage;
72 }
73 }
74 break;
75 }
76 }
77
Flush()78 std::string Flush() {
79 if (state_ == kMessage) {
80 HandleEndOfMessage();
81 state_ = kNonMessage;
82 }
83 std::string output(std::move(output_));
84 output_.clear();
85 return output;
86 }
87
88 private:
HandleEndOfMessage()89 void HandleEndOfMessage() {
90 if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_);
91 result.ok()) {
92 output_ += result.BestString();
93 output_changed_ = true;
94 } else {
95 output_ += message_buffer_; // Keep the original if it doesn't decode.
96 }
97 message_buffer_.clear();
98 }
99
100 const Detokenizer& detokenizer_;
101 std::string output_;
102 std::string message_buffer_;
103
104 enum : uint8_t { kNonMessage, kMessage } state_ = kNonMessage;
105 bool output_changed_ = false;
106 };
107
UnknownTokenMessage(uint32_t value)108 std::string UnknownTokenMessage(uint32_t value) {
109 std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
110
111 // Output a hexadecimal version of the token.
112 for (int shift = 28; shift >= 0; shift -= 4) {
113 output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
114 }
115
116 output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
117 return output;
118 }
119
120 // Decoding result with the date removed, for sorting.
121 using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
122
123 // Determines if one result is better than the other if collisions occurred.
124 // Returns true if lhs is preferred over rhs. This logic should match the
125 // collision resolution logic in detokenize.py.
IsBetterResult(const DecodingResult & lhs,const DecodingResult & rhs)126 bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
127 // Favor the result for which decoding succeeded.
128 if (lhs.first.ok() != rhs.first.ok()) {
129 return lhs.first.ok();
130 }
131
132 // Favor the result for which all bytes were decoded.
133 if ((lhs.first.remaining_bytes() == 0u) !=
134 (rhs.first.remaining_bytes() == 0u)) {
135 return lhs.first.remaining_bytes() == 0u;
136 }
137
138 // Favor the result with fewer decoding errors.
139 if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
140 return lhs.first.decoding_errors() < rhs.first.decoding_errors();
141 }
142
143 // Favor the result that successfully decoded the most arguments.
144 if (lhs.first.argument_count() != rhs.first.argument_count()) {
145 return lhs.first.argument_count() > rhs.first.argument_count();
146 }
147
148 // Favor the result that was removed from the database most recently.
149 return lhs.second > rhs.second;
150 }
151
// Returns true if all characters in data are printable or whitespace, or if
// the string is empty.
constexpr bool IsPrintableAscii(std::string_view data) {
  // This follows the logic in pw_tokenizer.decode_optionally_tokenized below:
  //
  //   if ''.join(text.split()).isprintable():
  //     return text
  //
  for (const char letter : data) {
    // Cast through unsigned char before calling the <cctype> classifiers:
    // passing a negative value (possible for bytes >= 0x80 when char is
    // signed) is undefined behavior per the C standard.
    const int ch = static_cast<unsigned char>(letter);
    if (std::isprint(ch) == 0 && std::isspace(ch) == 0) {
      return false;
    }
  }
  return true;
}
167
168 } // namespace
169
DetokenizedString(uint32_t token,const span<const TokenizedStringEntry> & entries,const span<const std::byte> & arguments)170 DetokenizedString::DetokenizedString(
171 uint32_t token,
172 const span<const TokenizedStringEntry>& entries,
173 const span<const std::byte>& arguments)
174 : token_(token), has_token_(true) {
175 std::vector<DecodingResult> results;
176
177 for (const auto& [format, date_removed] : entries) {
178 results.push_back(DecodingResult{
179 format.Format(span(reinterpret_cast<const uint8_t*>(arguments.data()),
180 arguments.size())),
181 date_removed});
182 }
183
184 std::sort(results.begin(), results.end(), IsBetterResult);
185
186 for (auto& result : results) {
187 matches_.push_back(std::move(result.first));
188 }
189 }
190
BestString() const191 std::string DetokenizedString::BestString() const {
192 return matches_.empty() ? std::string() : matches_[0].value();
193 }
194
BestStringWithErrors() const195 std::string DetokenizedString::BestStringWithErrors() const {
196 if (matches_.empty()) {
197 return has_token_ ? UnknownTokenMessage(token_)
198 : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
199 }
200 return matches_[0].value_with_errors();
201 }
202
Detokenizer(const TokenDatabase & database)203 Detokenizer::Detokenizer(const TokenDatabase& database) {
204 for (const auto& entry : database) {
205 database_[entry.token].emplace_back(entry.string, entry.date_removed);
206 }
207 }
208
FromElfSection(span<const std::byte> elf_section)209 Result<Detokenizer> Detokenizer::FromElfSection(
210 span<const std::byte> elf_section) {
211 size_t index = 0;
212 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database;
213
214 while (index + sizeof(_pw_tokenizer_EntryHeader) < elf_section.size()) {
215 _pw_tokenizer_EntryHeader header;
216 std::memcpy(
217 &header, elf_section.data() + index, sizeof(_pw_tokenizer_EntryHeader));
218 index += sizeof(_pw_tokenizer_EntryHeader);
219
220 if (header.magic != _PW_TOKENIZER_ENTRY_MAGIC) {
221 return Status::DataLoss();
222 }
223
224 index += header.domain_length;
225 if (index + header.string_length <= elf_section.size()) {
226 // TODO(b/326365218): Construct FormatString with string_view to avoid
227 // creating a copy here.
228 std::string entry(
229 reinterpret_cast<const char*>(elf_section.data() + index),
230 header.string_length);
231 index += header.string_length;
232 database[header.token].emplace_back(entry.c_str(),
233 TokenDatabase::kDateRemovedNever);
234 }
235 }
236 return Detokenizer(std::move(database));
237 }
238
FromElfFile(stream::SeekableReader & stream)239 Result<Detokenizer> Detokenizer::FromElfFile(stream::SeekableReader& stream) {
240 PW_TRY_ASSIGN(auto reader, pw::elf::ElfReader::FromStream(stream));
241
242 constexpr auto kTokenSectionName = ".pw_tokenizer.entries";
243 PW_TRY_ASSIGN(std::vector<std::byte> section_data,
244 reader.ReadSection(kTokenSectionName));
245
246 return Detokenizer::FromElfSection(section_data);
247 }
248
Detokenize(const span<const std::byte> & encoded) const249 DetokenizedString Detokenizer::Detokenize(
250 const span<const std::byte>& encoded) const {
251 // The token is missing from the encoded data; there is nothing to do.
252 if (encoded.empty()) {
253 return DetokenizedString();
254 }
255
256 uint32_t token = bytes::ReadInOrder<uint32_t>(
257 endian::little, encoded.data(), encoded.size());
258
259 const auto result = database_.find(token);
260
261 return DetokenizedString(
262 token,
263 result == database_.end() ? span<TokenizedStringEntry>()
264 : span(result->second),
265 encoded.size() < sizeof(token) ? span<const std::byte>()
266 : encoded.subspan(sizeof(token)));
267 }
268
DetokenizeBase64Message(std::string_view text) const269 DetokenizedString Detokenizer::DetokenizeBase64Message(
270 std::string_view text) const {
271 std::string buffer(text);
272 buffer.resize(PrefixedBase64DecodeInPlace(buffer));
273 return Detokenize(buffer);
274 }
275
DetokenizeText(std::string_view text,const unsigned max_passes) const276 std::string Detokenizer::DetokenizeText(std::string_view text,
277 const unsigned max_passes) const {
278 NestedMessageDetokenizer detokenizer(*this);
279 detokenizer.Detokenize(text);
280
281 std::string result;
282 unsigned pass = 1;
283
284 while (true) {
285 result = detokenizer.Flush();
286 if (pass >= max_passes || !detokenizer.OutputChangedSinceLastCheck()) {
287 break;
288 }
289 detokenizer.Detokenize(result);
290 pass += 1;
291 }
292 return result;
293 }
294
DecodeOptionallyTokenizedData(const ConstByteSpan & optionally_tokenized_data)295 std::string Detokenizer::DecodeOptionallyTokenizedData(
296 const ConstByteSpan& optionally_tokenized_data) {
297 // Try detokenizing as binary using the best result if available, else use
298 // the input data as a string.
299 const auto result = Detokenize(optionally_tokenized_data);
300 const bool found_matches = !result.matches().empty();
301 // Note: unlike pw_tokenizer.proto.decode_optionally_tokenized, this decoding
302 // process does not encode and decode UTF8 format, it is sufficient to check
303 // if the data is printable ASCII.
304 const std::string data =
305 found_matches
306 ? result.BestString()
307 : std::string(
308 reinterpret_cast<const char*>(optionally_tokenized_data.data()),
309 optionally_tokenized_data.size());
310
311 const bool is_data_printable = IsPrintableAscii(data);
312 if (!found_matches && !is_data_printable) {
313 // Assume the token is unknown or the data is corrupt.
314 std::vector<char> base64_encoding_buffer(
315 Base64EncodedBufferSize(optionally_tokenized_data.size()));
316 const size_t encoded_length = PrefixedBase64Encode(
317 optionally_tokenized_data, span(base64_encoding_buffer));
318 return std::string{base64_encoding_buffer.data(), encoded_length};
319 }
320
321 // Successfully detokenized, check if the field has more prefixed
322 // base64-encoded tokens.
323 const std::string field = DetokenizeText(data);
324 // If anything detokenized successfully, use that.
325 if (field != data) {
326 return field;
327 }
328
329 // Attempt to determine whether this is an unknown token or plain text.
330 // Any string with only printable or whitespace characters is plain text.
331 if (found_matches || is_data_printable) {
332 return data;
333 }
334
335 // Assume this field is tokenized data that could not be decoded.
336 std::vector<char> base64_encoding_buffer(
337 Base64EncodedBufferSize(optionally_tokenized_data.size()));
338 const size_t encoded_length = PrefixedBase64Encode(
339 optionally_tokenized_data, span(base64_encoding_buffer));
340 return std::string{base64_encoding_buffer.data(), encoded_length};
341 }
342
343 } // namespace pw::tokenizer
344