xref: /aosp_15_r20/external/pigweed/pw_tokenizer/py/pw_tokenizer/decode.py (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1# Copyright 2020 The Pigweed Authors
2#
3# Licensed under the Apache License, Version 2.0 (the "License"); you may not
4# use this file except in compliance with the License. You may obtain a copy of
5# the License at
6#
7#     https://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12# License for the specific language governing permissions and limitations under
13# the License.
14"""Decodes arguments and formats tokenized messages.
15
16The decode(format_string, encoded_arguments) function provides a simple way to
17format a string with encoded arguments. The FormatString class may also be used.
18
19Missing, truncated, or otherwise corrupted arguments are handled and displayed
20in the resulting string with an error message.
21"""
22
23from __future__ import annotations
24
25from datetime import datetime
26import math
27import re
28import struct
29from typing import (
30    Iterable,
31    NamedTuple,
32    Match,
33    Sequence,
34)
35
36
def zigzag_decode(value: int) -> int:
    """Undoes ZigZag encoding, as done in protobuf's wire_format module.

    Even inputs map to non-negative results (value / 2); odd inputs map to
    negative results (-(value + 1) / 2).
    """
    half = value >> 1
    # The low bit carries the sign: when set, the result is the bitwise
    # complement of the remaining bits.
    return ~half if value & 0x1 else half
42
43
class FormatSpec:
    """Represents a format specifier parsed from a printf-style string.

    This implementation is designed to align with the C99 specification,
    section 7.19.6
    (https://www.dii.uchile.cl/~daespino/files/Iso_C_1999_definition.pdf).
    Notably, this specification is slightly different than what is implemented
    in most compilers due to each compiler choosing to interpret undefined
    behavior in slightly different ways. Treat the following description as the
    source of truth.

    This implementation supports:
    - Overall Format: `%[flags][width][.precision][length][specifier]`
    - Flags (Zero or More)
      - `-`: Left-justify within the given field width; Right justification is
             the default (see Width modifier).
      - `+`: Forces to precede the result with a plus or minus sign (`+` or `-`)
             even for positive numbers. By default, only negative numbers are
             preceded with a `-` sign.
      - ` ` (space): If no sign is going to be written, a blank space is
                     inserted before the value.
      - `#`: Specifies an alternative print syntax should be used.
        - Used with `o`, `x` or `X` specifiers the value is preceded with `0`,
          `0x`, or `0X`, respectively, for values different than zero.
        - Used with `a`, `A`, `e`, `E`, `f`, `F`, `g`, or `G` it forces the
          written output to contain a decimal point even if no more digits
          follow. By default, if no digits follow, no decimal point is written.
      - `0`: Left-pads the number with zeroes (`0`) instead of spaces when
             padding is specified (see width sub-specifier).
    - Width (Optional)
      - ``(number)``: Minimum number of characters to be printed. If the value
                      to be printed is shorter than this number, the result is
                      padded with blank spaces or `0` if the `0` flag is
                      present. The value is not truncated even if the result is
                      larger. If the value is negative and the `0` flag is
                      present, the `0`s are padded after the `-` symbol.
      - `*`: The width is not specified in the format string, but as an
             additional integer value argument preceding the argument that has
             to be formatted.
    - Precision (Optional)
      - `.(number)`
        - For `d`, `i`, `o`, `u`, `x`, `X`, specifies the minimum number of
          digits to be written. If the value to be written is shorter than this
          number, the result is padded with leading zeros. The value is not
          truncated even if the result is longer.
          - A precision of `0` means that no character is written for the value
            `0`.
        - For `a`, `A`, `e`, `E`, `f`, and `F`, specifies the number of digits
          to be printed after the decimal point. By default, this is `6`.
        - For `g` and `G`, specifies the maximum number of significant digits to
          be printed.
        - For `s`, specifies the maximum number of characters to be printed. By
          default all characters are printed until the ending null character is
          encountered.
        - If the period is specified without an explicit value for precision,
          `0` is assumed.
      - `.*`: The precision is not specified in the format string, but as an
              additional integer value argument preceding the argument that has
              to be formatted.
    - Length (Optional)
      - `hh`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
              the argument will be a `signed char` or `unsigned char`. However,
              this is largely ignored in the implementation due to it not being
              necessary for Python or argument decoding (since the argument is
              always encoded at least as a 32-bit integer).
      - `h`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `signed short int` or `unsigned short int`.
             However, this is largely ignored in the implementation due to it
             not being necessary for Python or argument decoding (since the
             argument is always encoded at least as a 32-bit integer).
      - `l`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `signed long int` or `unsigned long int`.
             Also is usable with `c` and `s` to specify that the arguments will
             be encoded with `wchar_t` values (which isn't different from normal
             `char` values). However, this is largely ignored in the
             implementation due to it not being necessary for Python or argument
             decoding (since the argument is always encoded at least as a 32-bit
             integer).
      - `ll`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
              the argument will be a `signed long long int` or
              `unsigned long long int`. This is required to properly decode the
              argument as a 64-bit integer.
      - `L`: Usable with `a`, `A`, `e`, `E`, `f`, `F`, `g`, or `G` conversion
             specifiers applies to a long double argument. However, this is
             ignored in the implementation due to floating point value encoded
             that is unaffected by bit width.
      - `j`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `intmax_t` or `uintmax_t`.
      - `z`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `size_t`. This will force the argument to be
             decoded as an unsigned integer.
      - `t`: Usable with `d`, `i`, `o`, `u`, `x`, or `X` specifiers to convey
             the argument will be a `ptrdiff_t`.
      - If a length modifier is provided for an incorrect specifier, it is
        ignored.
    - Specifier (Required)
      - `d` / `i`: Used for signed decimal integers.
      - `u`: Used for unsigned decimal integers.
      - `o`: Used for unsigned decimal integers and specifies formatting should
             be as an octal number.
      - `x`: Used for unsigned decimal integers and specifies formatting should
             be as a hexadecimal number using all lowercase letters.
      - `X`: Used for unsigned decimal integers and specifies formatting should
             be as a hexadecimal number using all uppercase letters.
      - `f`: Used for floating-point values and specifies to use lowercase,
             decimal floating point formatting.
        - Default precision is `6` decimal places unless explicitly specified.
      - `F`: Used for floating-point values and specifies to use uppercase,
             decimal floating point formatting.
        - Default precision is `6` decimal places unless explicitly specified.
      - `e`: Used for floating-point values and specifies to use lowercase,
             exponential (scientific) formatting.
        - Default precision is `6` decimal places unless explicitly specified.
      - `E`: Used for floating-point values and specifies to use uppercase,
             exponential (scientific) formatting.
        - Default precision is `6` decimal places unless explicitly specified.
      - `g`: Used for floating-point values and specified to use `f` or `e`
             formatting depending on which would be the shortest representation.
        - Precision specifies the number of significant digits, not just digits
          after the decimal place.
        - If the precision is specified as `0`, it is interpreted to mean `1`.
        - `e` formatting is used if the exponent would be less than `-4` or
          is greater than or equal to the precision.
        - Trailing zeros are removed unless the `#` flag is set.
        - A decimal point only appears if it is followed by a digit.
        - `NaN` or infinities always follow `f` formatting.
      - `G`: Used for floating-point values and specified to use `f` or `e`
             formatting depending on which would be the shortest representation.
        - Precision specifies the number of significant digits, not just digits
          after the decimal place.
        - If the precision is specified as `0`, it is interpreted to mean `1`.
        - `E` formatting is used if the exponent would be less than `-4` or
          is greater than or equal to the precision.
        - Trailing zeros are removed unless the `#` flag is set.
        - A decimal point only appears if it is followed by a digit.
        - `NaN` or infinities always follow `F` formatting.
      - `c`: Used for formatting a `char` value.
      - `s`: Used for formatting a string of `char` values.
        - If width is specified, the null terminator character is included as a
          character for width count.
        - If precision is specified, no more `char`s than that value will be
          written from the string (padding is used to fill additional width).
      - `p`: Used for formatting a pointer address.
      - `%`: Prints a single `%`. Only valid as `%%` (supports no flags, width,
             precision, or length modifiers).

    Underspecified details:
    - If both `+` and ` ` flags appear, the ` ` is ignored.
    - The `+` and ` ` flags will error if used with `c` or `s`.
    - The `#` flag will error if used with `d`, `i`, `u`, `c`, `s`, or `p`.
    - The `0` flag will error if used with `c`, `s`, or `p`.
    - Both `+` and ` ` can work with the unsigned integer specifiers `u`, `o`,
      `x`, and `X`.
    - If a length modifier is provided for an incorrect specifier, it is
      ignored.
    - The `z` length modifier will decode arguments as signed as long as `d` or
      `i` is used.
    - `p` is implementation defined. For this implementation, it will print
      with a `0x` prefix and then the pointer value was printed using `%08X`.
      `p` supports the `+`, `-`, and ` ` flags, but not the `#` or `0` flags.
      None of the length modifiers are usable with `p`. This implementation will
      try to adhere to user-specified width (assuming the width provided is
      larger than the guaranteed minimum of 10). Specifying precision for `p` is
      considered an error.
    - Only `%%` is allowed with no other modifiers. Things like `%+%` will fail
      to decode. Some C stdlib implementations support any modifiers being
      present between `%`, but ignore any for the output.
    - If a width is specified with the `0` flag for a negative value, the padded
      `0`s will appear after the `-` symbol.
    - A precision of `0` for `d`, `i`, `u`, `o`, `x`, or `X` means that no
      character is written for the value `0`.
    - Precision cannot be specified for `c`.
    - Using `*` or fixed precision with the `s` specifier still requires the
      string argument to be null-terminated. This is due to argument encoding
      happening on the C/C++-side while the precision value is not read or
      otherwise used until decoding happens in this Python code.

    Non-conformant details:
    - `n` specifier: We do not support the `n` specifier since it is impossible
                     for us to retroactively tell the original program how many
                     characters have been printed since this decoding happens a
                     great deal of time after the device sent it, usually on a
                     separate processing device entirely.
    """

    # Regular expression for finding format specifiers.
    FORMAT_SPEC = re.compile(
        r'%(?P<flags>[+\- #0]+)?'
        r'(?P<width>\d+|\*)?'
        r'(?P<precision>\.(?:\d*|\*))?'
        r'(?P<length>hh|h|ll|l|j|z|t|L)?'
        r'(?P<type>[csdioxXufFeEaAgGnp%])'
    )

    # Conversions to make format strings Python compatible.
    _REMAP_TYPE = {'a': 'f', 'A': 'F', 'p': 'X'}

    # Conversion specifiers by type; n is not supported.
    SIGNED_INT = frozenset('di')
    UNSIGNED_INT = frozenset('oxXup')
    FLOATING_POINT = frozenset('fFeEaAgG')

    # Floating point arguments are read as 4-byte little-endian floats.
    _PACKED_FLOAT = struct.Struct('<f')

    @classmethod
    def from_string(cls, format_specifier: str):
        """Creates a FormatSpec from a str with a single format specifier.

        Raises:
          ValueError: the string is not exactly one format specifier.
        """
        match = cls.FORMAT_SPEC.fullmatch(format_specifier)

        if not match:
            raise ValueError(
                '{!r} is not a valid single format specifier'.format(
                    format_specifier
                )
            )

        return cls(match)

    def __init__(self, re_match: Match):
        """Constructs a FormatSpec from an re.Match object for FORMAT_SPEC.

        Unsupported combinations of flags, precision, length and type do not
        raise; they are recorded in self.error and reported by decode().
        """
        self.match = re_match
        self.specifier: str = self.match.group()

        self.flags: str = self.match.group('flags') or ''
        self.width: str = self.match.group('width') or ''
        self.precision: str = self.match.group('precision') or ''
        self.length: str = self.match.group('length') or ''
        self.type: str = self.match.group('type')

        # Only the first matching problem is recorded.
        self.error = None
        if self.type == 'n':
            self.error = 'Unsupported conversion specifier n.'
        elif self.type == '%':
            if self.flags or self.width or self.precision or self.length:
                # The leading "%%" names the specifier itself; this string is
                # never used as a format string.
                self.error = (
                    '%% does not support any flags, width, precision, '
                    'or length modifiers.'
                )
        elif self.type in 'csdiup' and '#' in self.flags:
            self.error = (
                '# is only supported with o, x, X, f, F, e, E, a, A, '
                'g, and G specifiers.'
            )
        elif self.type in 'csp' and '0' in self.flags:
            self.error = (
                '0 is only supported with d, i, o, u, x, X, a, A, e, '
                'E, f, F, g, and G specifiers.'
            )
        elif self.type in 'cs' and ('+' in self.flags or ' ' in self.flags):
            self.error = (
                '+ and space are only available for d, i, o, u, x, X, '
                'a, A, e, E, f, F, g, and G specifiers.'
            )
        elif self.type == 'c':
            if self.precision != '':
                self.error = 'Precision is not supported for specifier c.'
        elif self.type == 'p':
            if self.length != '':
                self.error = 'p does not support any length modifiers.'
            elif self.precision != '':
                self.error = 'p does not support precision modifiers.'

        # If we are going to add additional characters to the output, we add to
        # width_bias to ensure user-provided widths are reduced by that amount.
        self._width_bias = 0
        # Some of our machinery requires that we maintain a minimum precision
        # width to ensure a certain amount of digits gets printed. This
        # increases the user-provided precision in these cases if it was not
        # enough.
        self._minimum_precision = 0
        # Python's handling of %#o is non-standard and prepends a 0o
        # instead of single 0.
        if self.type == 'o' and '#' in self.flags:
            self._width_bias = 1
        # Python does not support %p natively.
        if self.type == 'p':
            self._width_bias = 2
            self._minimum_precision = 8

        # If we have a concrete width, we reduce it by any width bias.
        # Otherwise, we either have no width or width is *, where the decoding
        # logic will handle the width bias.
        parsed_width = int(self.width.replace('*', '') or '0')
        if parsed_width > self._width_bias:
            self.width = f'{parsed_width - self._width_bias}'

        # Python %-operator does not support `.` without a
        # trailing number. `.` is defined to be equivalent to `.0`.
        if self.precision == '.':
            self.precision = '.0'

        # If we have a concrete precision that is not *, we check that it is at
        # least minimum precision. If it is *, other parts of decoding will
        # ensure the minimum is upheld.
        if (
            self.precision != '.*'
            and int(self.precision.replace('.', '') or '0')
            < self._minimum_precision
        ):
            self.precision = f'.{self._minimum_precision}'

        # The Python %-format machinery never requires the length
        # modifier to work correctly, and it doesn't support all of the
        # C99 length format specifiers anyway. We remove it from the
        # Python-compatible format string.
        self.compatible = ''.join(
            [
                '%',
                self.flags,
                self.width,
                self.precision,
                self._REMAP_TYPE.get(self.type, self.type),
            ]
        )

    def decode(self, encoded_arg: bytes) -> DecodedArg:
        """Decodes the provided data according to this format specifier.

        A `*` width and/or `.*` precision is decoded first as a signed integer
        from the front of encoded_arg; the main argument is decoded from the
        bytes that follow. The pieces are then combined into one DecodedArg.

        Returns:
          A DecodedArg. If this specifier is unsupported (self.error is set),
          the result has DECODE_ERROR status and consumes no bytes.
        """
        if self.error is not None:
            return DecodedArg(
                self, None, b'', DecodedArg.DECODE_ERROR, self.error
            )

        width = None
        if self.width == '*':
            width = FormatSpec.from_string('%d').decode(encoded_arg)
            encoded_arg = encoded_arg[len(width.raw_data) :]

        precision = None
        if self.precision == '.*':
            precision = FormatSpec.from_string('%d').decode(encoded_arg)
            encoded_arg = encoded_arg[len(precision.raw_data) :]

        if self.type == '%':
            return DecodedArg(
                self, (), b''
            )  # Use () as the value for % formatting.

        if self.type == 's':
            return self._merge_decoded_args(
                width, precision, self._decode_string(encoded_arg)
            )

        if self.type == 'c':
            return self._merge_decoded_args(
                width, precision, self._decode_char(encoded_arg)
            )

        if self.type in self.SIGNED_INT:
            return self._merge_decoded_args(
                width, precision, self._decode_signed_integer(encoded_arg)
            )

        if self.type in self.UNSIGNED_INT:
            return self._merge_decoded_args(
                width, precision, self._decode_unsigned_integer(encoded_arg)
            )

        if self.type in self.FLOATING_POINT:
            return self._merge_decoded_args(
                width, precision, self._decode_float(encoded_arg)
            )

        # Should be unreachable. Raise explicitly rather than `assert False`
        # so the check is not stripped when Python runs with -O.
        raise AssertionError(f'Unhandled format specifier: {self.type}')

    def text_float_safe_compatible(self) -> str:
        """Returns the Python-compatible specifier with `0` flags made spaces.

        Used for inf/nan values, which must be space padded, not zero padded.
        """
        return ''.join(
            [
                '%',
                self.flags.replace('0', ' '),
                self.width,
                self.precision,
                self._REMAP_TYPE.get(self.type, self.type),
            ]
        )

    def _merge_decoded_args(
        self,
        width: DecodedArg | None,
        precision: DecodedArg | None,
        main: DecodedArg,
    ) -> DecodedArg:
        """Combines `*` width / `.*` precision args with the main argument.

        Args:
          width: decoded `*` width argument, or None if the width is not `*`.
          precision: decoded `.*` precision argument, or None if not `.*`.
          main: the decoded argument being formatted.

        Returns:
          `main` unchanged if there is neither a width nor a precision
          argument. Otherwise a new DecodedArg whose value is the tuple
          (width, precision, value) with absent entries omitted, whose raw
          data is the concatenation of the parts, and whose status and error
          are the union of the parts' statuses and errors. If no part could be
          decoded at all, the value is None.
        """
        if width is None and precision is None:
            return main

        parts = []
        values = []

        if width is not None:
            parts.append(width)
            # A width that failed to decode has no value; leave it as None
            # rather than attempting arithmetic on it.
            values.append(
                None
                if width.value is None
                else width.value - self._width_bias
            )

        if precision is not None:
            parts.append(precision)
            values.append(
                None
                if precision.value is None
                else max(precision.value, self._minimum_precision)
            )

        parts.append(main)
        values.append(main.value)

        status = DecodedArg.OK
        for part in parts:
            status |= part.status

        # Errors may be exception objects (e.g. from string or char decoding),
        # so convert each to str before joining.
        errors = ' '.join(str(part.error) for part in parts if part.error)

        return DecodedArg(
            main.specifier,
            tuple(values) if any(v is not None for v in values) else None,
            b''.join(part.raw_data for part in parts),
            status,
            errors or None,
        )

    def _decode_signed_integer(
        self,
        encoded: bytes,
    ) -> DecodedArg:
        """Decodes a signed variable-length integer.

        Each byte contributes its low 7 bits, least significant group first;
        a clear high bit marks the final byte. The assembled value is then
        ZigZag decoded.

        Returns:
          A MISSING arg if `encoded` is empty, a DECODE_ERROR arg if no
          terminating byte is found before 64 bits have been shifted in or the
          data runs out, otherwise an OK arg holding the integer.
        """
        if not encoded:
            return DecodedArg.missing(self)

        count = 0
        result = 0
        shift = 0

        for byte in encoded:
            count += 1
            result |= (byte & 0x7F) << shift

            if not byte & 0x80:
                return DecodedArg(
                    self,
                    zigzag_decode(result),
                    encoded[:count],
                    DecodedArg.OK,
                )

            shift += 7
            if shift >= 64:
                break

        return DecodedArg(
            self,
            None,
            encoded[:count],
            DecodedArg.DECODE_ERROR,
            'Unterminated variable-length integer',
        )

    def _decode_unsigned_integer(self, encoded: bytes) -> DecodedArg:
        """Decodes an unsigned variable-length integer."""
        arg = self._decode_signed_integer(encoded)
        # Since ZigZag encoding is used, unsigned integers must be masked off to
        # their original bit length.
        if arg.value is not None:
            arg.value &= (1 << self.size_bits()) - 1

        return arg

    def _decode_float(self, encoded: bytes) -> DecodedArg:
        """Decodes a 4-byte little-endian float from the front of `encoded`.

        Returns a MISSING arg if fewer than 4 bytes are available.
        """
        if len(encoded) < 4:
            return DecodedArg.missing(self)

        return DecodedArg(
            self, self._PACKED_FLOAT.unpack_from(encoded)[0], encoded[:4]
        )

    def _decode_string(self, encoded: bytes) -> DecodedArg:
        """Reads a unicode string from the encoded data.

        The first byte holds the string size in its low 7 bits; its high bit
        marks the string as TRUNCATED. If fewer bytes than the stated size are
        available, or the bytes cannot be decoded as text, the result carries
        DECODE_ERROR.
        """
        if not encoded:
            return DecodedArg.missing(self)

        size_and_status = encoded[0]
        status = DecodedArg.OK

        if size_and_status & 0x80:
            status |= DecodedArg.TRUNCATED
            size_and_status &= 0x7F

        raw_data = encoded[0 : size_and_status + 1]
        data = raw_data[1:]

        if len(data) < size_and_status:
            status |= DecodedArg.DECODE_ERROR

        try:
            decoded = data.decode()
        except UnicodeDecodeError as err:
            # Fall back to the bytes repr (without the b prefix) as the value.
            return DecodedArg(
                self,
                repr(bytes(data)).lstrip('b'),
                raw_data,
                status | DecodedArg.DECODE_ERROR,
                err,
            )

        return DecodedArg(self, decoded, raw_data, status)

    def _decode_char(self, encoded: bytes) -> DecodedArg:
        """Reads an integer from the data, then converts it to a string."""
        arg = self._decode_signed_integer(encoded)

        if arg.ok():
            try:
                arg.value = chr(arg.value)
            except (OverflowError, ValueError) as err:
                # The integer is not a valid code point; keep the integer as
                # the value and flag the error.
                arg.error = err
                arg.status |= DecodedArg.DECODE_ERROR

        return arg

    def size_bits(self) -> int:
        """Size of the argument in bits; 0 for strings."""
        if self.type == 's':
            return 0

        # TODO(hepler): 64-bit targets likely have 64-bit l, j, z, and t.
        return 64 if self.length in ['ll', 'j'] else 32

    def __str__(self) -> str:
        return self.specifier
570
571
class DecodedArg:
    """Represents a decoded argument that is ready to be formatted.

    The value is either a single decoded value or, when the specifier uses a
    `*` width and/or `.*` precision, a tuple ending with the main value.
    """

    # Status flags for a decoded argument. These values should match the
    # DecodingStatus enum in pw_tokenizer/internal/decode.h.
    OK = 0  # decoding was successful
    MISSING = 1  # the argument was not present in the data
    TRUNCATED = 2  # the argument was truncated during encoding
    DECODE_ERROR = 4  # an error occurred while decoding the argument
    SKIPPED = 8  # argument was skipped due to a previous error

    # Text appended to string arguments that were truncated during encoding.
    _TRUNCATION_MARKER = '[...]'

    @classmethod
    def missing(cls, specifier: FormatSpec):
        """Creates a DecodedArg for an argument with no data available."""
        return cls(specifier, None, b'', cls.MISSING)

    def __init__(
        self,
        specifier: FormatSpec,
        value,
        raw_data: bytes,
        status: int = OK,
        error=None,
    ):
        self.specifier = specifier  # FormatSpec (e.g. to represent "%0.2f")
        self.value = value  # the decoded value, or None if decoding failed
        self.raw_data = bytes(
            raw_data
        )  # the exact bytes used to decode this arg
        self._status = status
        self.error = error

    def ok(self) -> bool:
        """The argument was decoded without errors."""
        return self.status == self.OK or self.status == self.TRUNCATED

    @property
    def status(self) -> int:
        return self._status

    @status.setter
    def status(self, status: int):
        # The %% specifier is always OK and should always be printed normally.
        self._status = status if self.specifier.type != '%' else self.OK

    def _main_value(self):
        """Returns the argument's own value, without any width/precision."""
        if isinstance(self.value, tuple) and self.value:
            return self.value[-1]
        return self.value

    def _value_with_truncation_marker(self):
        """Returns the value with the truncation marker appended to the text.

        When the value is a (width/precision..., text) tuple, only the final
        text element is extended.
        """
        if isinstance(self.value, tuple):
            return self.value[:-1] + (
                self.value[-1] + self._TRUNCATION_MARKER,
            )
        return self.value + self._TRUNCATION_MARKER

    def format(self) -> str:
        """Returns formatted version of this argument, with error handling.

        Successfully decoded arguments are formatted with the specifier.
        Otherwise a `<[specifier STATUS (value)]>` marker is returned, where
        STATUS is SKIPPED, MISSING or ERROR. If formatting itself fails, the
        argument is marked DECODE_ERROR and the failure is stored in
        self.error.
        """
        if self.status == self.TRUNCATED:
            return (
                self.specifier.compatible
                % self._value_with_truncation_marker()
            )

        if self.ok():
            # Check if we are effectively .0{diuoxX} with a 0 value (this
            # includes .* with (0, 0)). C standard says a value of 0 with 0
            # precision produces an empty string.
            is_integer_specifier_type = self.specifier.type in 'diuoxX'
            is_simple_0_precision_with_0_value = self.value == 0 and (
                self.specifier.precision == '.0'
                or self.specifier.precision == '.'
            )
            is_star_0_precision_with_0_value = (
                self.value == (0, 0) and self.specifier.precision == '.*'
            )
            if is_integer_specifier_type and (
                is_simple_0_precision_with_0_value
                or is_star_0_precision_with_0_value
            ):
                return ''

            try:
                # Python has a nonstandard alternative octal form.
                if self.specifier.type == 'o' and '#' in self.specifier.flags:
                    return self._format_alternative_octal()

                # Python doesn't pad zeros correctly for inf/nan. NaN never
                # compares equal to itself, so test with math.isfinite
                # instead of ==. The main value is checked so that `*`
                # width/precision tuples are handled too.
                main_value = self._main_value()
                if (
                    self.specifier.type in self.specifier.FLOATING_POINT
                    and isinstance(main_value, float)
                    and not math.isfinite(main_value)
                ):
                    return self._format_text_float()

                # Python doesn't have a native pointer formatter.
                if self.specifier.type == 'p':
                    return self._format_pointer()

                return self.specifier.compatible % self.value
            except (OverflowError, TypeError, ValueError) as err:
                self._status |= self.DECODE_ERROR
                self.error = err

        if self.status & self.SKIPPED:
            message = '{} SKIPPED'.format(self.specifier)
        elif self.status == self.MISSING:
            message = '{} MISSING'.format(self.specifier)
        elif self.status & self.DECODE_ERROR:
            message = '{} ERROR'.format(self.specifier)
        else:
            raise AssertionError(
                'Unhandled DecodedArg status {:x}!'.format(self.status)
            )

        if self.value is None or not str(self.value):
            return '<[{}]>'.format(message)

        return '<[{} ({})]>'.format(message, self.value)

    def _format_alternative_octal(self) -> str:
        """Formats an alternative octal specifier.

        This potentially throws OverflowError, TypeError, or ValueError.
        """
        compatible_specifier = self.specifier.compatible.replace('#', '')
        result = compatible_specifier % self.value

        # Find index of the first non-space, non-plus, and non-zero
        # character. If we cannot find anything, we will simply
        # prepend a 0 to the formatted string.
        # NOTE(review): for a value of 0 every character is in ' +0', so an
        # extra 0 is prepended (e.g. '00'), whereas C prints a single '0' for
        # %#o -- confirm whether this is intended.
        counter = 0
        for i, value in enumerate(result):
            if value not in ' +0':
                counter = i
                break
        return result[:counter] + '0' + result[counter:]

    def _format_text_float(self) -> str:
        """Formats a float specifier with txt value (e.g. NAN, INF).

        This potentially throws OverflowError, TypeError, or ValueError.
        """
        return self.specifier.text_float_safe_compatible() % self.value

    def _format_pointer(self) -> str:
        """Formats a pointer specifier.

        This potentially throws OverflowError, TypeError, or ValueError.
        """
        result = self.specifier.compatible % self.value

        # Find index of the first non-space, non-plus, and non-zero
        # character (unless we hit the first of the 8 required hex
        # digits).
        counter = 0
        for i, value in enumerate(result[:-7]):
            if value not in ' +0' or i == len(result) - 8:
                counter = i
                break

        # Insert the pointer 0x prefix in after the leading `+`,
        # space, or `0`
        return result[:counter] + '0x' + result[counter:]

    def __str__(self) -> str:
        return self.format()

    def __repr__(self) -> str:
        return f'DecodedArg({self})'
727
728
def parse_format_specifiers(format_string: str) -> Iterable[FormatSpec]:
    """Lazily yields a FormatSpec for each specifier found in format_string.

    Specifiers are located with the FormatSpec.FORMAT_SPEC pattern and are
    yielded in the order in which they appear in the string.
    """
    yield from map(FormatSpec, FormatSpec.FORMAT_SPEC.finditer(format_string))
732
733
class FormattedString(NamedTuple):
    """The result of formatting a string with encoded arguments."""

    value: str  # the formatted string
    args: Sequence[DecodedArg]  # the decoded arguments
    remaining: bytes  # encoded data left over after decoding the arguments

    def ok(self) -> bool:
        """Arg data decoded successfully and all expected args were found."""
        every_arg_ok = all(arg.ok() for arg in self.args)
        return every_arg_ok and not self.remaining

    def score(self, date_removed: datetime | None = None) -> tuple:
        """Returns a key for sorting by how successful a decode was.

        Decoded strings are sorted by whether they

          1. decoded all bytes for all arguments without errors,
          2. decoded all data,
          3. have the fewest decoding errors,
          4. decoded the most arguments successfully, or
          5. have the most recent removal date, if they were removed.

        This must match the collision resolution logic in detokenize.cc.

        To format a list of FormattedStrings from most to least successful,
        use sort(key=FormattedString.score, reverse=True).

        Args:
          date_removed: when the string was removed, if it was; when this is
              None (or otherwise falsy), datetime.max is used instead so the
              entry sorts as the most recent

        Returns:
          tuple that compares greater for a more successful decode
        """
        # 1. Decoded all data and all expected args were found.
        fully_ok = self.ok()

        # 2. Decoded all data.
        consumed_everything = not self.remaining

        # 3. Fewest errors: negated so that fewer failures compare greater.
        failure_count = sum(1 for arg in self.args if not arg.ok())

        # 4. Decoded the most arguments.
        arg_count = len(self.args)

        # 5. Most recently present.
        last_present = date_removed or datetime.max

        return (
            fully_ok,
            consumed_everything,
            -failure_count,
            arg_count,
            last_present,
        )
766
767
class FormatString:
    """Represents a printf-style format string.

    The format string is parsed once when the object is created. Formatting
    fills a per-call copy of the parsed template, so format() leaves the
    FormatString itself unchanged and one instance can be reused for any
    number of encoded payloads.
    """

    def __init__(self, format_string: str):
        """Parses format specifiers in the format string."""
        self.format_string = format_string
        self.specifiers = tuple(parse_format_specifiers(self.format_string))

        # Template list: literal string pieces at the even indices and a None
        # placeholder at each odd index, where a formatted argument belongs.
        # This list is only ever copied by format(), never written to.
        self._segments = self._parse_string_segments()

    def _parse_string_segments(self) -> list:
        """Splits the format string by format specifiers.

        Returns:
          list that alternates literal text and None placeholders, starting
          and ending with literal text (which may be empty); with no
          specifiers this is a one-item list holding the whole format string
        """
        segments: list = []
        previous_end = 0

        for spec in self.specifiers:
            start, end = spec.match.span()
            # Literal text between the previous specifier (or the start of
            # the string) and this one, then the slot for this argument.
            segments.append(self.format_string[previous_end:start])
            segments.append(None)
            previous_end = end

        # The text after the last format specifier.
        segments.append(self.format_string[previous_end:])
        return segments

    def decode(self, encoded: bytes) -> tuple[Sequence[DecodedArg], bytes]:
        """Decodes arguments according to the format string.

        Args:
          encoded: bytes; the encoded arguments

        Returns:
          tuple with the decoded arguments and any unparsed data
        """
        decoded_args = []

        fatal_error = False
        index = 0

        for spec in self.specifiers:
            arg = spec.decode(encoded[index:])

            if fatal_error:
                # After an error is encountered, continue to attempt to parse
                # arguments, but mark them all as SKIPPED. If an error occurs,
                # it's impossible to know if subsequent arguments are valid.
                arg.status |= DecodedArg.SKIPPED
            elif not arg.ok():
                fatal_error = True

            decoded_args.append(arg)
            # Advance past the bytes this argument consumed.
            index += len(arg.raw_data)

        return tuple(decoded_args), encoded[index:]

    def format(
        self, encoded_args: bytes, show_errors: bool = False
    ) -> FormattedString:
        """Decodes arguments and formats the string with them.

        Args:
          encoded_args: the arguments to decode and format the string with
          show_errors: if True, an error message is used in place of the %
              conversion specifier when an argument fails to decode

        Returns:
          tuple with the formatted string, decoded arguments, and remaining data
        """
        args, remaining = self.decode(encoded_args)

        if show_errors:
            replacements = [arg.format() for arg in args]
        else:
            # Arguments that failed to decode are shown as their original
            # conversion specifier text.
            replacements = [
                arg.format() if arg.ok() else arg.specifier.specifier
                for arg in args
            ]

        # Fill a copy of the template rather than the template itself, so the
        # instance never retains (or shares between calls) formatted output.
        segments = list(self._segments)
        segments[1::2] = replacements

        return FormattedString(''.join(segments), args, remaining)
856
857
def decode(
    format_string: str, encoded_arguments: bytes, show_errors: bool = False
) -> str:
    """Decodes arguments and formats them with the provided format string.

    This is a convenience wrapper that builds a FormatString, formats the
    arguments with it, and returns only the resulting text.

    Args:
      format_string: the printf-style format string
      encoded_arguments: encoded arguments with which to format
          format_string; must exclude the 4-byte string token
      show_errors: if True, an error message is used in place of the %
          conversion specifier when an argument fails to decode

    Returns:
      the printf-style formatted string
    """
    formatter = FormatString(format_string)
    formatted = formatter.format(encoded_arguments, show_errors)
    return formatted.value
876