xref: /aosp_15_r20/external/pigweed/pw_tokenizer/py/pw_tokenizer/encode.py (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1# Copyright 2020 The Pigweed Authors
2#
3# Licensed under the Apache License, Version 2.0 (the "License"); you may not
4# use this file except in compliance with the License. You may obtain a copy of
5# the License at
6#
7#     https://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12# License for the specific language governing permissions and limitations under
13# the License.
14"""Provides functionality for encoding tokenized messages."""
15
16import argparse
17import base64
18import struct
19import sys
20from typing import Sequence
21
22from pw_tokenizer import tokens
23
# Largest value that fits in a signed 32-bit integer. _encode_int32 treats any
# argument above this as the unsigned spelling of a negative number.
_INT32_MAX = 2**31 - 1
# Largest value that fits in an unsigned 32-bit integer; encode_token_and_args
# rejects tokens above this.
_UINT32_MAX = 2**32 - 1
# NOTE(review): the names suggest these are marker characters for nested
# tokenized messages. Within this file only '$' appears again (as the default
# prefix of prefixed_base64) -- confirm exact meaning against the users of
# these constants.
NESTED_TOKEN_PREFIX = '$'
NESTED_TOKEN_BASE_PREFIX = '#'
NESTED_DOMAIN_START_PREFIX = '{'
NESTED_DOMAIN_END_PREFIX = '}'
30
31
32def _zig_zag_encode(value: int) -> int:
33    """Encodes signed integers to give a compact varint encoding."""
34    return value << 1 if value >= 0 else (value << 1) ^ (~0)
35
36
37def _little_endian_base128_encode(integer: int) -> bytearray:
38    data = bytearray()
39
40    while True:
41        # Grab 7 bits; the eighth bit is set to 1 to indicate more data coming.
42        data.append((integer & 0x7F) | 0x80)
43        integer >>= 7
44
45        if not integer:
46            break
47
48    data[-1] &= 0x7F  # clear the top bit of the last byte
49    return data
50
51
def _encode_int32(arg: int) -> bytearray:
    """Encodes an integer argument as a zig-zag encoded varint.

    Values greater than _INT32_MAX are first reduced by 2**32, i.e. they are
    reinterpreted as the negative number with the same 32-bit pattern.
    """
    signed_value = arg - 2**32 if arg > _INT32_MAX else arg
    zig_zagged = _zig_zag_encode(signed_value)
    return _little_endian_base128_encode(zig_zagged)
58
59
60def _encode_string(arg: bytes) -> bytes:
61    size_byte = len(arg) if len(arg) < 128 else 0xFF
62    return struct.pack('B', size_byte) + arg[:127]
63
64
65def encode_args(*args: int | float | bytes | str) -> bytes:
66    """Encodes a list of arguments to their on-wire representation."""
67
68    data = bytearray(b'')
69    for arg in args:
70        if isinstance(arg, int):
71            if arg.bit_length() > 32:
72                raise ValueError(
73                    f'Cannot encode {arg}: only 32-bit integers may be encoded'
74                )
75            data += _encode_int32(arg)
76        elif isinstance(arg, float):
77            data += struct.pack('<f', arg)
78        elif isinstance(arg, str):
79            data += _encode_string(arg.encode())
80        elif isinstance(arg, bytes):
81            data += _encode_string(arg)
82        else:
83            raise ValueError(
84                f'{arg} has type {type(arg)}, which is not supported'
85            )
86    return bytes(data)
87
88
89def encode_token_and_args(
90    token: int, *args: int | float | bytes | str
91) -> bytes:
92    """Encodes a tokenized message given its token and arguments.
93
94    This function assumes that the token represents a format string with
95    conversion specifiers that correspond with the provided argument types.
96    Currently, only 32-bit integers are supported.
97    """
98
99    if token < 0 or token > _UINT32_MAX:
100        raise ValueError(
101            f'The token ({token}) must be an unsigned 32-bit integer'
102        )
103
104    return struct.pack('<I', token) + encode_args(*args)
105
106
107def prefixed_base64(data: bytes, prefix: str | bytes = '$') -> str:
108    """Encodes a tokenized message as prefixed Base64."""
109    prefix = prefix if isinstance(prefix, str) else prefix.decode()
110    return prefix + base64.b64encode(data).decode()
111
112
113def _parse_user_input(string: str):
114    """Evaluates a string as Python code or returns it as a literal string."""
115    try:
116        value = eval(string, dict(__builtins__={}))  # pylint: disable=eval-used
117    except (NameError, SyntaxError):
118        return string
119
120    return value if isinstance(value, (int, float)) else string
121
122
def _main(format_string_list: Sequence[str], raw_args: Sequence[str]) -> int:
    """Encodes one format string with its arguments and prints the result.

    Args:
      format_string_list: A sequence holding exactly one format string.
      raw_args: The unparsed argument strings for the format string.

    Returns:
      0, for use as the process exit code.
    """
    [format_string] = format_string_list
    hashed = tokens.pw_tokenizer_65599_hash(format_string)
    parsed_args = tuple(map(_parse_user_input, raw_args))

    message = encode_token_and_args(hashed, *parsed_args)
    # Read the token back out of the first four bytes of the encoded message.
    message_token = int.from_bytes(message[:4], 'little')
    hex_dump = message.hex(' ')

    print(f'      Raw input: {format_string!r} % {parsed_args!r}')
    print(f'Formatted input: {format_string % parsed_args}')
    print(f'          Token: 0x{message_token:08x}')
    print(f'        Encoded: {message!r} ({hex_dump}) [{len(message)} bytes]')
    print(f'Prefixed Base64: {prefixed_base64(message)}')

    return 0
139
140
141def _parse_args() -> dict:
142    parser = argparse.ArgumentParser(
143        description=__doc__,
144        formatter_class=argparse.RawDescriptionHelpFormatter,
145    )
146    parser.add_argument(
147        'format_string_list',
148        metavar='FORMAT_STRING',
149        nargs=1,
150        help='Format string with optional %%-style arguments.',
151    )
152    parser.add_argument(
153        'raw_args',
154        metavar='ARG',
155        nargs='*',
156        help=(
157            'Arguments for the format string, if any. Arguments are parsed '
158            'as Python expressions, with no builtins (e.g. 9 is the number '
159            '9 and \'"9"\' is the string "9"). Arguments that are not valid '
160            'Python are treated as string literals.'
161        ),
162    )
163    return vars(parser.parse_args())
164
165
# Script entry point: parse the command line, run _main with the parsed
# arguments as keyword arguments, and use its return value as the exit code.
if __name__ == '__main__':
    sys.exit(_main(**_parse_args()))
168