# Copyright 2020 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Decodes arguments and formats tokenized messages.

The decode(format_string, encoded_arguments) function provides a simple way to
format a string with encoded arguments. The FormatString class may also be used.

Missing, truncated, or otherwise corrupted arguments are handled and displayed
in the resulting string with an error message.
"""

from __future__ import annotations

from datetime import datetime
import math
import re
import struct
from typing import (
    Iterable,
    NamedTuple,
    Match,
    Sequence,
)


def zigzag_decode(value: int) -> int:
    """ZigZag decode function from protobuf's wire_format module."""
    if not value & 0x1:
        return value >> 1
    return (value >> 1) ^ (~0)


class FormatSpec:
    """Represents a format specifier parsed from a printf-style string.

    This implementation is designed to align with the C99 specification,
    section 7.19.6
    (https://www.dii.uchile.cl/~daespino/files/Iso_C_1999_definition.pdf).
    Notably, this specification is slightly different than what is implemented
    in most compilers due to each compiler choosing to interpret undefined
    behavior in slightly different ways. Treat the following description as the
    source of truth.

    This implementation supports:
    - Overall Format: `%[flags][width][.precision][length][specifier]`
    - Flags (Zero or More)
      - `-`: Left-justify within the given field width; Right justification is
             the default (see Width modifier).
      - `+`: Forces to precede the result with a plus or minus sign (`+` or
             `-`) even for positive numbers. By default, only negative numbers
             are preceded with a `-` sign.
      - ` ` (space): If no sign is going to be written, a blank space is
                     inserted before the value.
      - `#`: Specifies an alternative print syntax should be used.
        - Used with `o`, `x` or `X` specifiers the value is preceded with `0`,
          `0x`, or `0X`, respectively, for values different than zero.
        - Used with `a`, `A`, `e`, `E`, `f`, `F`, `g`, or `G` it forces the
          written output to contain a decimal point even if no more digits
          follow. By default, if no digits follow, no decimal point is written.
      - `0`: Left-pads the number with zeroes (`0`) instead of spaces when
             padding is specified (see width sub-specifier).
    - Width (Optional)
      - ``(number)``: Minimum number of characters to be printed. If the value
                      to be printed is shorter than this number, the result is
                      padded with blank spaces or `0` if the `0` flag is
                      present. The value is not truncated even if the result is
                      larger. If the value is negative and the `0` flag is
                      present, the `0`s are padded after the `-` symbol.
      - `*`: The width is not specified in the format string, but as an
             additional integer value argument preceding the argument that has
             to be formatted.
    - Precision (Optional)
      - `.(number)`
        - For `d`, `i`, `o`, `u`, `x`, `X`, specifies the minimum number of
          digits to be written. If the value to be written is shorter than this
          number, the result is padded with leading zeros. The value is not
          truncated even if the result is longer.
          - A precision of `0` means that no character is written for the value
            `0`.
        - For `a`, `A`, `e`, `E`, `f`, and `F`, specifies the number of digits
          to be printed after the decimal point. By default, this is `6`.
        - For `g` and `G`, specifies the maximum number of significant digits to
          be printed.
        - For `s`, specifies the maximum number of characters to be printed. By
          default all characters are printed until the ending null character is
          encountered.
        - If the period is specified without an explicit value for precision,
          `0` is assumed.
      - `.*`: The precision is not specified in the format string, but as an
              additional integer value argument preceding the argument that has
              to be formatted.
    - Length (Optional)
      - `hh`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
              the argument will be a `signed char` or `unsigned char`. However,
              this is largely ignored in the implementation due to it not being
              necessary for Python or argument decoding (since the argument is
              always encoded at least as a 32-bit integer).
      - `h`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `signed short int` or `unsigned short int`.
             However, this is largely ignored in the implementation due to it
             not being necessary for Python or argument decoding (since the
             argument is always encoded at least as a 32-bit integer).
      - `l`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `signed long int` or `unsigned long int`.
             Also is usable with `c` and `s` to specify that the arguments will
             be encoded with `wchar_t` values (which isn't different from normal
             `char` values). However, this is largely ignored in the
             implementation due to it not being necessary for Python or argument
             decoding (since the argument is always encoded at least as a 32-bit
             integer).
      - `ll`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
              the argument will be a `signed long long int` or
              `unsigned long long int`. This is required to properly decode the
              argument as a 64-bit integer.
      - `L`: Usable with `a`, `A`, `e`, `E`, `f`, `F`, `g`, or `G` conversion
             specifiers applies to a long double argument. However, this is
             ignored in the implementation due to floating point value encoded
             that is unaffected by bit width.
      - `j`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `intmax_t` or `uintmax_t`.
      - `z`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `size_t`. This will force the argument to be
             decoded as an unsigned integer.
      - `t`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `ptrdiff_t`.
      - If a length modifier is provided for an incorrect specifier, it is
        ignored.
    - Specifier (Required)
      - `d` / `i`: Used for signed decimal integers.
      - `u`: Used for unsigned decimal integers.
      - `o`: Used for unsigned decimal integers and specifies formatting should
             be as an octal number.
      - `x`: Used for unsigned decimal integers and specifies formatting should
             be as a hexadecimal number using all lowercase letters.
      - `X`: Used for unsigned decimal integers and specifies formatting should
             be as a hexadecimal number using all uppercase letters.
      - `f`: Used for floating-point values and specifies to use lowercase,
             decimal floating point formatting.
        - Default precision is `6` decimal places unless explicitly specified.
      - `F`: Used for floating-point values and specifies to use uppercase,
             decimal floating point formatting.
        - Default precision is `6` decimal places unless explicitly specified.
      - `e`: Used for floating-point values and specifies to use lowercase,
             exponential (scientific) formatting.
        - Default precision is `6` decimal places unless explicitly specified.
      - `E`: Used for floating-point values and specifies to use uppercase,
             exponential (scientific) formatting.
        - Default precision is `6` decimal places unless explicitly specified.
      - `g`: Used for floating-point values and specified to use `f` or `e`
             formatting depending on which would be the shortest representation.
        - Precision specifies the number of significant digits, not just digits
          after the decimal place.
        - If the precision is specified as `0`, it is interpreted to mean `1`.
        - `e` formatting is used if the exponent would be less than `-4` or
          is greater than or equal to the precision.
        - Trailing zeros are removed unless the `#` flag is set.
        - A decimal point only appears if it is followed by a digit.
        - `NaN` or infinities always follow `f` formatting.
      - `G`: Used for floating-point values and specified to use `f` or `e`
             formatting depending on which would be the shortest representation.
        - Precision specifies the number of significant digits, not just digits
          after the decimal place.
        - If the precision is specified as `0`, it is interpreted to mean `1`.
        - `E` formatting is used if the exponent would be less than `-4` or
          is greater than or equal to the precision.
        - Trailing zeros are removed unless the `#` flag is set.
        - A decimal point only appears if it is followed by a digit.
        - `NaN` or infinities always follow `F` formatting.
      - `c`: Used for formatting a `char` value.
      - `s`: Used for formatting a string of `char` values.
        - If width is specified, the null terminator character is included as a
          character for width count.
        - If precision is specified, no more `char`s than that value will be
          written from the string (padding is used to fill additional width).
      - `p`: Used for formatting a pointer address.
      - `%`: Prints a single `%`. Only valid as `%%` (supports no flags, width,
             precision, or length modifiers).

    Underspecified details:
    - If both `+` and ` ` flags appear, the ` ` is ignored.
    - The `+` and ` ` flags will error if used with `c` or `s`.
    - The `#` flag will error if used with `d`, `i`, `u`, `c`, `s`, or `p`.
    - The `0` flag will error if used with `c`, `s`, or `p`.
    - Both `+` and ` ` can work with the unsigned integer specifiers `u`, `o`,
      `x`, and `X`.
    - If a length modifier is provided for an incorrect specifier, it is
      ignored.
    - The `z` length modifier will decode arguments as signed as long as `d` or
      `i` is used.
    - `p` is implementation defined. For this implementation, it will print
      with a `0x` prefix and then the pointer value was printed using `%08X`.
      `p` supports the `+`, `-`, and ` ` flags, but not the `#` or `0` flags.
      None of the length modifiers are usable with `p`. This implementation will
      try to adhere to user-specified width (assuming the width provided is
      larger than the guaranteed minimum of 10). Specifying precision for `p` is
      considered an error.
    - Only `%%` is allowed with no other modifiers. Things like `%+%` will fail
      to decode. Some C stdlib implementations support any modifiers being
      present between `%`, but ignore any for the output.
    - If a width is specified with the `0` flag for a negative value, the padded
      `0`s will appear after the `-` symbol.
    - A precision of `0` for `d`, `i`, `u`, `o`, `x`, or `X` means that no
      character is written for the value `0`.
    - Precision cannot be specified for `c`.
    - Using `*` or fixed precision with the `s` specifier still requires the
      string argument to be null-terminated. This is due to argument encoding
      happening on the C/C++-side while the precision value is not read or
      otherwise used until decoding happens in this Python code.

    Non-conformant details:
    - `n` specifier: We do not support the `n` specifier since it is impossible
                     for us to retroactively tell the original program how many
                     characters have been printed since this decoding happens a
                     great deal of time after the device sent it, usually on a
                     separate processing device entirely.
    """

    # Regular expression for finding format specifiers.
    FORMAT_SPEC = re.compile(
        r'%(?P<flags>[+\- #0]+)?'
        r'(?P<width>\d+|\*)?'
        r'(?P<precision>\.(?:\d*|\*))?'
        r'(?P<length>hh|h|ll|l|j|z|t|L)?'
        r'(?P<type>[csdioxXufFeEaAgGnp%])'
    )

    # Conversions to make format strings Python compatible.
    _REMAP_TYPE = {'a': 'f', 'A': 'F', 'p': 'X'}

    # Conversion specifiers by type; n is not supported.
    SIGNED_INT = frozenset('di')
    UNSIGNED_INT = frozenset('oxXup')
    FLOATING_POINT = frozenset('fFeEaAgG')

    _PACKED_FLOAT = struct.Struct('<f')

    @classmethod
    def from_string(cls, format_specifier: str):
        """Creates a FormatSpec from a str with a single format specifier.

        Raises:
          ValueError: the string is not exactly one valid format specifier
        """
        match = cls.FORMAT_SPEC.fullmatch(format_specifier)

        if not match:
            raise ValueError(
                '{!r} is not a valid single format specifier'.format(
                    format_specifier
                )
            )

        return cls(match)

    def __init__(self, re_match: Match):
        """Constructs a FormatSpec from an re.Match object for FORMAT_SPEC.

        Invalid flag/specifier combinations do not raise; they are recorded in
        self.error and reported when decode() is called.
        """
        self.match = re_match
        self.specifier: str = self.match.group()

        self.flags: str = self.match.group('flags') or ''
        self.width: str = self.match.group('width') or ''
        self.precision: str = self.match.group('precision') or ''
        self.length: str = self.match.group('length') or ''
        self.type: str = self.match.group('type')

        self.error: str | None = None
        if self.type == 'n':
            self.error = 'Unsupported conversion specifier n.'
        elif self.type == '%':
            if self.flags or self.width or self.precision or self.length:
                self.error = (
                    '%% does not support any flags, width, precision, '
                    'or length modifiers.'
                )
        elif self.type in 'csdiup' and '#' in self.flags:
            self.error = (
                '# is only supported with o, x, X, f, F, e, E, a, A, '
                'g, and G specifiers.'
            )
        elif self.type in 'csp' and '0' in self.flags:
            self.error = (
                '0 is only supported with d, i, o, u, x, X, a, A, e, '
                'E, f, F, g, and G specifiers.'
            )
        elif self.type in 'cs' and ('+' in self.flags or ' ' in self.flags):
            self.error = (
                '+ and space are only available for d, i, o, u, x, X, '
                'a, A, e, E, f, F, g, and G specifiers.'
            )
        elif self.type == 'c':
            if self.precision != '':
                self.error = 'Precision is not supported for specifier c.'
        elif self.type == 'p':
            if self.length != '':
                self.error = 'p does not support any length modifiers.'
            elif self.precision != '':
                self.error = 'p does not support precision modifiers.'

        # If we are going to add additional characters to the output, we add to
        # width_bias to ensure user-provided widths are reduced by that amount.
        self._width_bias = 0
        # Some of our machinery requires that we maintain a minimum precision
        # width to ensure a certain amount of digits gets printed. This
        # increases the user-provided precision in these cases if it was not
        # enough.
        self._minimum_precision = 0
        # Python's handling of %#o is non-standard and prepends a 0o
        # instead of single 0.
        if self.type == 'o' and '#' in self.flags:
            self._width_bias = 1
        # Python does not support %p natively.
        if self.type == 'p':
            self._width_bias = 2
            self._minimum_precision = 8

        # If we have a concrete width, we reduce it by any width bias.
        # Otherwise, we either have no width or width is *, where the decoding
        # logic will handle the width bias.
        parsed_width = int(self.width.replace('*', '') or '0')
        if parsed_width > self._width_bias:
            self.width = f'{parsed_width - self._width_bias}'

        # Python %-operator does not support `.` without a
        # trailing number. `.` is defined to be equivalent to `.0`.
        if self.precision == '.':
            self.precision = '.0'

        # If we have a concrete precision that is not *, we check that it is at
        # least minimum precision. If it is *, other parts of decoding will
        # ensure the minimum is upheld.
        if (
            self.precision != '.*'
            and int(self.precision.replace('.', '') or '0')
            < self._minimum_precision
        ):
            self.precision = f'.{self._minimum_precision}'

        # The Python %-format machinery never requires the length
        # modifier to work correctly, and it doesn't support all of the
        # C99 length format specifiers anyway. We remove it from the
        # python-compatible format string.
        self.compatible = ''.join(
            [
                '%',
                self.flags,
                self.width,
                self.precision,
                self._REMAP_TYPE.get(self.type, self.type),
            ]
        )

    def decode(self, encoded_arg: bytes) -> DecodedArg:
        """Decodes the provided data according to this format specifier.

        A `*` width and/or `.*` precision are read first as signed integers;
        the main argument is decoded from the bytes that follow them.
        """
        if self.error is not None:
            return DecodedArg(
                self, None, b'', DecodedArg.DECODE_ERROR, self.error
            )

        width = None
        if self.width == '*':
            width = FormatSpec.from_string('%d').decode(encoded_arg)
            encoded_arg = encoded_arg[len(width.raw_data) :]

        precision = None
        if self.precision == '.*':
            precision = FormatSpec.from_string('%d').decode(encoded_arg)
            encoded_arg = encoded_arg[len(precision.raw_data) :]

        if self.type == '%':
            return DecodedArg(
                self, (), b''
            )  # Use () as the value for % formatting.

        if self.type == 's':
            main = self._decode_string(encoded_arg)
        elif self.type == 'c':
            main = self._decode_char(encoded_arg)
        elif self.type in self.SIGNED_INT:
            main = self._decode_signed_integer(encoded_arg)
        elif self.type in self.UNSIGNED_INT:
            main = self._decode_unsigned_integer(encoded_arg)
        elif self.type in self.FLOATING_POINT:
            main = self._decode_float(encoded_arg)
        else:
            # Should be unreachable; raise explicitly so the check survives
            # running Python with -O (which strips assert statements).
            raise AssertionError(f'Unhandled format specifier: {self.type}')

        return self._merge_decoded_args(width, precision, main)

    def text_float_safe_compatible(self) -> str:
        """Returns the Python format string with `0` flags turned into ` `.

        Used for inf/nan, which must not be zero-padded.
        """
        return ''.join(
            [
                '%',
                self.flags.replace('0', ' '),
                self.width,
                self.precision,
                self._REMAP_TYPE.get(self.type, self.type),
            ]
        )

    def _merge_decoded_args(
        self,
        width: DecodedArg | None,
        precision: DecodedArg | None,
        main: DecodedArg,
    ) -> DecodedArg:
        """Combines `*` width / `.*` precision args with the main argument.

        Returns main unchanged when neither is present. Otherwise the merged
        value is a tuple `(width?, precision?, main)` suitable for the
        %-operator, raw data is concatenated, and statuses are OR-ed together.
        Components that failed to decode stay None in the tuple.
        """
        if width is None and precision is None:
            return main

        parts: list[DecodedArg] = []
        values: list = []

        if width is not None:
            parts.append(width)
            # A missing/corrupt width has no value; don't do arithmetic on it.
            values.append(
                None
                if width.value is None
                else width.value - self._width_bias
            )

        if precision is not None:
            parts.append(precision)
            values.append(
                None
                if precision.value is None
                else max(precision.value, self._minimum_precision)
            )

        parts.append(main)
        values.append(main.value)

        status = DecodedArg.OK
        for part in parts:
            status |= part.status

        # Errors may be str messages or exception objects; stringify them so
        # they can be joined.
        error = ' '.join(str(p.error) for p in parts if p.error) or None

        return DecodedArg(
            main.specifier,
            tuple(values),
            b''.join(p.raw_data for p in parts),
            status,
            error,
        )

    def _decode_signed_integer(
        self,
        encoded: bytes,
    ) -> DecodedArg:
        """Decodes a signed variable-length integer."""
        if not encoded:
            return DecodedArg.missing(self)

        count = 0
        result = 0
        shift = 0

        for byte in encoded:
            count += 1
            result |= (byte & 0x7F) << shift

            # A clear high bit marks the final byte of the varint.
            if not byte & 0x80:
                return DecodedArg(
                    self,
                    zigzag_decode(result),
                    encoded[:count],
                    DecodedArg.OK,
                )

            shift += 7
            if shift >= 64:
                break

        return DecodedArg(
            self,
            None,
            encoded[:count],
            DecodedArg.DECODE_ERROR,
            'Unterminated variable-length integer',
        )

    def _decode_unsigned_integer(self, encoded: bytes) -> DecodedArg:
        """Decodes an unsigned variable-length integer."""
        arg = self._decode_signed_integer(encoded)
        # Since ZigZag encoding is used, unsigned integers must be masked off to
        # their original bit length.
        if arg.value is not None:
            arg.value &= (1 << self.size_bits()) - 1

        return arg

    def _decode_float(self, encoded: bytes) -> DecodedArg:
        """Decodes a little-endian 32-bit float from the first four bytes."""
        if len(encoded) < 4:
            return DecodedArg.missing(self)

        return DecodedArg(
            self, self._PACKED_FLOAT.unpack_from(encoded)[0], encoded[:4]
        )

    def _decode_string(self, encoded: bytes) -> DecodedArg:
        """Reads a unicode string from the encoded data.

        The first byte holds the length in its low 7 bits; the high bit marks
        a string that was truncated during encoding.
        """
        if not encoded:
            return DecodedArg.missing(self)

        size_and_status = encoded[0]
        status = DecodedArg.OK

        if size_and_status & 0x80:
            status |= DecodedArg.TRUNCATED
            size_and_status &= 0x7F

        raw_data = encoded[0 : size_and_status + 1]
        data = raw_data[1:]

        # Fewer bytes available than the length prefix promised.
        if len(data) < size_and_status:
            status |= DecodedArg.DECODE_ERROR

        try:
            decoded = data.decode()
        except UnicodeDecodeError as err:
            return DecodedArg(
                self,
                repr(bytes(data)).lstrip('b'),
                raw_data,
                status | DecodedArg.DECODE_ERROR,
                err,
            )

        return DecodedArg(self, decoded, raw_data, status)

    def _decode_char(self, encoded: bytes) -> DecodedArg:
        """Reads an integer from the data, then converts it to a string."""
        arg = self._decode_signed_integer(encoded)

        if arg.ok():
            try:
                arg.value = chr(arg.value)
            except (OverflowError, ValueError) as err:
                arg.error = err
                arg.status |= DecodedArg.DECODE_ERROR

        return arg

    def size_bits(self) -> int:
        """Size of the argument in bits; 0 for strings."""
        if self.type == 's':
            return 0

        # TODO(hepler): 64-bit targets likely have 64-bit l, j, z, and t.
        return 64 if self.length in ['ll', 'j'] else 32

    def __str__(self) -> str:
        return self.specifier


class DecodedArg:
    """Represents a decoded argument that is ready to be formatted."""

    # Status flags for a decoded argument. These values should match the
    # DecodingStatus enum in pw_tokenizer/internal/decode.h.
    OK = 0  # decoding was successful
    MISSING = 1  # the argument was not present in the data
    TRUNCATED = 2  # the argument was truncated during encoding
    DECODE_ERROR = 4  # an error occurred while decoding the argument
    SKIPPED = 8  # argument was skipped due to a previous error

    # Text appended to strings that were truncated during encoding.
    _TRUNCATION_MARKER = '[...]'

    @classmethod
    def missing(cls, specifier: FormatSpec):
        """Creates a DecodedArg for an argument absent from the data."""
        return cls(specifier, None, b'', cls.MISSING)

    def __init__(
        self,
        specifier: FormatSpec,
        value,
        raw_data: bytes,
        status: int = OK,
        error=None,
    ):
        self.specifier = specifier  # FormatSpec (e.g. to represent "%0.2f")
        self.value = value  # the decoded value, or None if decoding failed
        self.raw_data = bytes(
            raw_data
        )  # the exact bytes used to decode this arg
        self._status = status
        self.error = error

    def ok(self) -> bool:
        """The argument was decoded without errors."""
        return self.status == self.OK or self.status == self.TRUNCATED

    @property
    def status(self) -> int:
        return self._status

    @status.setter
    def status(self, status: int):
        # The %% specifier is always OK and should always be printed normally.
        self._status = status if self.specifier.type != '%' else self.OK

    def format(self) -> str:
        """Returns formatted version of this argument, with error handling."""
        if self.ok():
            if self._is_zero_with_zero_precision():
                return ''

            try:
                return self._format_value()
            except (OverflowError, TypeError, ValueError) as err:
                self._status |= self.DECODE_ERROR
                self.error = err

        if self.status & self.SKIPPED:
            message = '{} SKIPPED'.format(self.specifier)
        elif self.status == self.MISSING:
            message = '{} MISSING'.format(self.specifier)
        elif self.status & self.DECODE_ERROR:
            message = '{} ERROR'.format(self.specifier)
        else:
            raise AssertionError(
                'Unhandled DecodedArg status {:x}!'.format(self.status)
            )

        if self.value is None or not str(self.value):
            return '<[{}]>'.format(message)

        return '<[{} ({})]>'.format(message, self.value)

    def _is_zero_with_zero_precision(self) -> bool:
        """Checks for .0{diuoxX} with a 0 value (including .* with (0, 0)).

        The C standard says a value of 0 with 0 precision produces an empty
        string.
        """
        if self.specifier.type not in 'diuoxX':
            return False

        if self.specifier.precision == '.*':
            return self.value == (0, 0)

        return self.value == 0 and self.specifier.precision in ('.0', '.')

    def _format_value(self) -> str:
        """Formats a successfully decoded value.

        This potentially throws OverflowError, TypeError, or ValueError.
        """
        spec = self.specifier

        if self.status == self.TRUNCATED:
            return spec.compatible % self._value_with_truncation_marker()

        # Python has a nonstandard alternative octal form.
        if spec.type == 'o' and '#' in spec.flags:
            return self._format_alternative_octal()

        # Python doesn't pad zeros correctly for inf/nan.
        if spec.type in FormatSpec.FLOATING_POINT and self._is_non_finite():
            return self._format_text_float()

        # Python doesn't have a native pointer formatter.
        if spec.type == 'p':
            return self._format_pointer()

        return spec.compatible % self.value

    def _value_with_truncation_marker(self):
        """Returns the value with the truncation marker added to the string.

        With a `*` width or `.*` precision the value is a tuple whose last
        element is the string, so the marker is appended to that element.
        """
        if isinstance(self.value, tuple):
            return self.value[:-1] + (
                self.value[-1] + self._TRUNCATION_MARKER,
            )
        return self.value + self._TRUNCATION_MARKER

    def _is_non_finite(self) -> bool:
        """True if the float being formatted is inf, -inf, or NaN.

        NaN never compares equal to anything (itself included), so this uses
        math.isfinite rather than an equality comparison.
        """
        number = self.value
        if isinstance(number, tuple) and number:
            number = number[-1]  # (width?, precision?, value)
        return isinstance(number, float) and not math.isfinite(number)

    def _format_alternative_octal(self) -> str:
        """Formats an alternative octal specifier.

        This potentially throws OverflowError, TypeError, or ValueError.
        """
        compatible_specifier = self.specifier.compatible.replace('#', '')
        result = compatible_specifier % self.value

        # Find index of the first non-space, non-plus, and non-zero
        # character. If we cannot find anything, we will simply
        # prepend a 0 to the formatted string.
        counter = 0
        for i, value in enumerate(result):
            if value not in ' +0':
                counter = i
                break
        return result[:counter] + '0' + result[counter:]

    def _format_text_float(self) -> str:
        """Formats a float specifier with txt value (e.g. NAN, INF).

        This potentially throws OverflowError, TypeError, or ValueError.
        """
        return self.specifier.text_float_safe_compatible() % self.value

    def _format_pointer(self) -> str:
        """Formats a pointer specifier.

        This potentially throws OverflowError, TypeError, or ValueError.
        """
        result = self.specifier.compatible % self.value

        # Find index of the first non-space, non-plus, and non-zero
        # character (unless we hit the first of the 8 required hex
        # digits).
        counter = 0
        for i, value in enumerate(result[:-7]):
            if value not in ' +0' or i == len(result) - 8:
                counter = i
                break

        # Insert the pointer 0x prefix in after the leading `+`,
        # space, or `0`
        return result[:counter] + '0x' + result[counter:]

    def __str__(self) -> str:
        return self.format()

    def __repr__(self) -> str:
        return f'DecodedArg({self})'


def parse_format_specifiers(format_string: str) -> Iterable[FormatSpec]:
    """Yields a FormatSpec for each format specifier in the string."""
    for spec in FormatSpec.FORMAT_SPEC.finditer(format_string):
        yield FormatSpec(spec)


class FormattedString(NamedTuple):
    """Result of formatting: the text, decoded args, and leftover bytes."""

    value: str
    args: Sequence[DecodedArg]
    remaining: bytes

    def ok(self) -> bool:
        """Arg data decoded successfully and all expected args were found."""
        return all(arg.ok() for arg in self.args) and not self.remaining

    def score(self, date_removed: datetime | None = None) -> tuple:
        """Returns a key for sorting by how successful a decode was.

        Decoded strings are sorted by whether they

          1. decoded all bytes for all arguments without errors,
          2. decoded all data,
          3. have the fewest decoding errors,
          4. decoded the most arguments successfully, or
          5. have the most recent removal date, if they were removed.

        This must match the collision resolution logic in detokenize.cc.

        To format a list of FormattedStrings from most to least successful,
        use sort(key=FormattedString.score, reverse=True).
        """
        return (
            self.ok(),  # decoded all data and all expected args were found
            not self.remaining,  # decoded all data
            -sum(not arg.ok() for arg in self.args),  # fewest errors
            len(self.args),  # decoded the most arguments
            date_removed or datetime.max,
        )  # most recently present


class FormatString:
    """Represents a printf-style format string."""

    def __init__(self, format_string: str):
        """Parses format specifiers in the format string."""
        self.format_string = format_string
        self.specifiers = tuple(parse_format_specifiers(self.format_string))

        # List of non-specifier string pieces with room for formatted arguments.
        self._segments = self._parse_string_segments()

    def _parse_string_segments(self) -> list:
        """Splits the format string by format specifiers.

        Returns a list that alternates literal text (even indices) with None
        placeholders (odd indices), one placeholder per specifier.
        """
        segments: list = []
        cursor = 0

        for spec in self.specifiers:
            start, end = spec.match.span()
            segments.append(self.format_string[cursor:start])
            segments.append(None)  # Spot for the formatted argument.
            cursor = end

        # Append the format string segment after the last format specifier.
        segments.append(self.format_string[cursor:])
        return segments

    def decode(self, encoded: bytes) -> tuple[Sequence[DecodedArg], bytes]:
        """Decodes arguments according to the format string.

        Args:
          encoded: bytes; the encoded arguments

        Returns:
          tuple with the decoded arguments and any unparsed data
        """
        decoded_args = []

        fatal_error = False
        index = 0

        for spec in self.specifiers:
            arg = spec.decode(encoded[index:])

            if fatal_error:
                # After an error is encountered, continue to attempt to parse
                # arguments, but mark them all as SKIPPED. If an error occurs,
                # it's impossible to know if subsequent arguments are valid.
                arg.status |= DecodedArg.SKIPPED
            elif not arg.ok():
                fatal_error = True

            decoded_args.append(arg)
            index += len(arg.raw_data)

        return tuple(decoded_args), encoded[index:]

    def format(
        self, encoded_args: bytes, show_errors: bool = False
    ) -> FormattedString:
        """Decodes arguments and formats the string with them.

        Args:
          encoded_args: the arguments to decode and format the string with
          show_errors: if True, an error message is used in place of the %
              conversion specifier when an argument fails to decode

        Returns:
          tuple with the formatted string, decoded arguments, and remaining data
        """
        args, remaining = self.decode(encoded_args)

        if show_errors:
            formatted = [arg.format() for arg in args]
        else:
            formatted = [
                arg.format() if arg.ok() else arg.specifier.specifier
                for arg in args
            ]

        # Insert formatted arguments in place of each format specifier. Work
        # on a copy so the parsed template is never mutated by a call.
        segments = list(self._segments)
        segments[1::2] = formatted

        return FormattedString(''.join(segments), args, remaining)


def decode(
    format_string: str, encoded_arguments: bytes, show_errors: bool = False
) -> str:
    """Decodes arguments and formats them with the provided format string.

    Args:
      format_string: the printf-style format string
      encoded_arguments: encoded arguments with which to format
          format_string; must exclude the 4-byte string token
      show_errors: if True, an error message is used in place of the %
          conversion specifier when an argument fails to decode

    Returns:
      the printf-style formatted string
    """
    return (
        FormatString(format_string).format(encoded_arguments, show_errors).value
    )