xref: /aosp_15_r20/external/pigweed/pw_tokenizer/py/pw_tokenizer/elf_reader.py (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1#!/usr/bin/env python3
2# Copyright 2020 The Pigweed Authors
3#
4# Licensed under the Apache License, Version 2.0 (the "License"); you may not
5# use this file except in compliance with the License. You may obtain a copy of
6# the License at
7#
8#     https://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13# License for the specific language governing permissions and limitations under
14# the License.
15"""Reads data from ELF sections.
16
17This module provides tools for dumping the contents of an ELF section. It can
18also be used to read values at a particular address. A command line interface
19for both of these features is provided.
20
21This module supports any ELF-format file, including .o and .so files. This
22module also has basic support for archive (.a) files. All ELF files in an
23archive are read as one unit.
24"""
25
26from __future__ import annotations
27
28import argparse
29import collections
30from pathlib import Path
31import re
32import struct
33import sys
34from typing import (
35    BinaryIO,
36    Iterable,
37    Mapping,
38    NamedTuple,
39    Pattern,
40)
41
# Leading bytes checked by files_in_archive() to identify an archive (.a) file.
ARCHIVE_MAGIC = b'!<arch>\n'
# Leading bytes checked by FieldReader to identify an ELF file.
ELF_MAGIC = b'\x7fELF'
44
45
def _check_next_bytes(fd: BinaryIO, expected: bytes, what: str) -> None:
    """Consumes the next bytes of the file and verifies that they match.

    Args:
      fd: binary file object; its position advances by up to len(expected)
      expected: the bytes that must appear at the current position
      what: human-readable description of the bytes, used in the error message

    Raises:
      FileDecodeError: the bytes read differ from the expected bytes
    """
    actual = fd.read(len(expected))
    if expected != actual:
        # Not every file object has a name (e.g. io.BytesIO), so fall back to
        # a placeholder when reporting which file was malformed.
        raise FileDecodeError(
            f'Invalid {what}: expected {expected!r}, found {actual!r} in file '
            f'{getattr(fd, "name", "(unknown)")}'
        )
53
54
def files_in_archive(fd: BinaryIO) -> Iterable[int]:
    """Walks the members of an archive, yielding the size of each one.

    When a size is yielded, the file is positioned at the first byte of that
    member's contents, so the caller may read it. The position is restored
    before moving on to the next member.

    Raises:
      FileDecodeError: the archive magic, a member size, or a member header
          ending is malformed
    """

    _check_next_bytes(fd, ARCHIVE_MAGIC, 'archive magic number')

    # Each file in an archive is prefixed with an ASCII header:
    #
    #   16 B - file identifier (text)
    #   12 B - file modification timestamp (decimal)
    #    6 B - owner ID (decimal)
    #    6 B - group ID (decimal)
    #    8 B - file mode (octal)
    #   10 B - file size in bytes (decimal)
    #    2 B - ending characters (`\n)
    #
    # Only the size is needed; everything before it is skipped.
    bytes_before_size = 16 + 12 + 6 + 6 + 8

    while True:
        # Some archives place an extra \n after a member's contents. Consume
        # it if present; otherwise step back over the byte just read.
        if fd.read(1) != b'\n':
            fd.seek(-1, 1)

        fd.seek(bytes_before_size, 1)
        raw_size = fd.read(10)
        if not raw_size:
            return  # Reached the end of the archive.

        try:
            member_size = int(raw_size, 10)
        except ValueError as exc:
            raise FileDecodeError(
                'Archive file sizes must be decimal integers'
            ) from exc

        _check_next_bytes(fd, b'`\n', 'archive file header ending')

        # Remember where the contents start, since the caller may move the
        # file position while handling this member.
        contents_start = fd.tell()

        yield member_size

        fd.seek(contents_start + member_size)
95
96
def _elf_files_in_archive(fd: BinaryIO):
    """Yields once for each ELF image found at the file's current position.

    A plain ELF file produces a single item. Otherwise the file is treated as
    an archive and one item is produced per member that starts with the ELF
    magic. The yielded value is always None; the file position is what matters.
    """
    if _bytes_match(fd, ELF_MAGIC):
        yield  # The value isn't used, so just yield None.
        return

    for _ in files_in_archive(fd):
        if _bytes_match(fd, ELF_MAGIC):
            yield
104
105
class Field(NamedTuple):
    """A field in an ELF file.

    Fields refer to a particular piece of data in an ELF file or section header.
    Each field records its location and width separately for the 32-bit and
    64-bit layouts; FieldReader picks the matching pair for the file it reads.
    """

    # Name of the field (e.g. 'e_shoff'); informational only.
    name: str
    # Offset of the field from the start of its header in the 32-bit layout.
    offset_32: int
    # Offset of the field from the start of its header in the 64-bit layout.
    offset_64: int
    # Size of the field in bytes in the 32-bit layout.
    size_32: int
    # Size of the field in bytes in the 64-bit layout.
    size_64: int
117
118
class _FileHeader(NamedTuple):
    """Fields in the ELF file header.

    Every attribute defaults to the Field describing where that value lives,
    so a default-constructed instance serves as a table of field definitions.
    """

    # Offset of the section header table within the ELF file.
    section_header_offset: Field = Field('e_shoff', 0x20, 0x28, 4, 8)
    # Number of entries in the section header table.
    section_count: Field = Field('e_shnum', 0x30, 0x3C, 2, 2)
    # Index of the section header for the section that holds section names.
    section_names_index: Field = Field('e_shstrndx', 0x32, 0x3E, 2, 2)


# Shared table of file header field definitions.
FILE_HEADER = _FileHeader()
128
129
class _SectionHeader(NamedTuple):
    """Fields in an ELF section header.

    Every attribute defaults to the Field describing where that value lives,
    so a default-constructed instance serves as a table of field definitions.
    """

    # Offset of the section's name within the section names table.
    section_name_offset: Field = Field('sh_name', 0x00, 0x00, 4, 4)
    # Address of the section.
    section_address: Field = Field('sh_addr', 0x0C, 0x10, 4, 8)
    # Offset of the section's contents within the ELF file.
    section_offset: Field = Field('sh_offset', 0x10, 0x18, 4, 8)
    # Size of the section's contents in bytes.
    section_size: Field = Field('sh_size', 0x14, 0x20, 4, 8)

    # section_header_end records the size of the header. It is a zero-size
    # pseudo-field: only its offset (the end of the header) is ever used.
    section_header_end: Field = Field('section end', 0x28, 0x40, 0, 0)


# Shared table of section header field definitions.
SECTION_HEADER = _SectionHeader()
143
144
def read_c_string(fd: BinaryIO) -> bytes:
    """Reads bytes from the file up to a null terminator or end of file.

    The terminator is consumed but is not included in the returned bytes.
    """
    collected = bytearray()

    # iter() with a sentinel stops as soon as read() returns b'' (end of file).
    for char in iter(lambda: fd.read(1), b''):
        if char == b'\0':
            break
        collected += char

    return bytes(collected)
153
154
def _bytes_match(fd: BinaryIO, expected: bytes) -> bool:
    """Checks whether the upcoming bytes equal expected without consuming them.

    The file position is restored after peeking. I/O errors while peeking are
    reported as "no match" rather than raised.
    """
    try:
        start = fd.tell()
        peeked = fd.read(len(expected))
        fd.seek(start)
    except IOError:
        return False

    return peeked == expected
164
165
def compatible_file(file: BinaryIO | str | Path) -> bool:
    """True if the file type is supported (ELF or archive).

    Args:
      file: a path to open, or an already-open binary file object. An open
          file object is left at the position it had when passed in.

    Raises:
      OSError: the path could not be opened
    """
    if isinstance(file, (str, Path)):
        # The context manager closes the file on every path and lets a failed
        # open() propagate its own error instead of masking it.
        with open(file, 'rb') as fd:
            return _bytes_match(fd, ELF_MAGIC) or _bytes_match(
                fd, ARCHIVE_MAGIC
            )

    # The caller owns this file object: check the start of the file, then put
    # the position back where it was.
    offset = file.tell()
    file.seek(0)
    result = _bytes_match(file, ELF_MAGIC) or _bytes_match(file, ARCHIVE_MAGIC)
    file.seek(offset)

    return result
180
181
class FileDecodeError(Exception):
    """Invalid data was read from an ELF file.

    Raised by this module when expected magic bytes are missing, an archive
    member size is not a decimal integer, or the ELF address size or
    endianness byte has an unknown value.
    """
184
185
class FieldReader:
    """Decodes values described by Field tuples from one ELF image.

    The reader must be created with the file positioned at the start of the
    ELF image. That position is remembered as file_offset, and every later
    read is made relative to it, so images inside archives work too.
    """

    def __init__(self, elf: BinaryIO):
        self._elf = elf
        self.file_offset = self._elf.tell()

        _check_next_bytes(self._elf, ELF_MAGIC, 'ELF file header')
        address_class = self._elf.read(1)  # e_ident[EI_CLASS] (address size)

        # The endianness byte directly follows the class byte, so it has to be
        # consumed before the class byte is validated.
        unpackers = self._determine_integer_format()

        if address_class == b'\x01':
            width = '32'
        elif address_class == b'\x02':
            width = '64'
        else:
            raise FileDecodeError(f'Unknown size {address_class!r}')

        # Select the Field attributes that apply to this file's layout.
        offset_attr = 'offset_' + width
        size_attr = 'size_' + width

        self.offset = lambda field: getattr(field, offset_attr)
        self._size = lambda field: getattr(field, size_attr)
        self._decode = lambda field, data: (
            unpackers[getattr(field, size_attr)].unpack(data)[0]
        )

    def _determine_integer_format(self) -> Mapping[int, struct.Struct]:
        """Reads the endianness byte and builds unsigned integer decoders.

        Returns:
          a dict from integer size in bytes (1, 2, 4, 8) to the struct.Struct
          that unpacks an unsigned integer of that size

        Raises:
          FileDecodeError: the endianness byte is neither 1 nor 2
        """
        order_byte = self._elf.read(1)  # e_ident[EI_DATA] (endianness)

        prefix = {b'\x01': '<', b'\x02': '>'}.get(order_byte)
        if prefix is None:
            raise FileDecodeError(f'Unknown endianness {order_byte!r}')

        return {
            size: struct.Struct(prefix + code)
            for size, code in ((1, 'B'), (2, 'H'), (4, 'I'), (8, 'Q'))
        }

    def read(self, field: Field, base: int = 0) -> int:
        """Reads the integer value of a field.

        Args:
          field: the field to read
          base: offset of the header containing the field, relative to the
              start of the ELF image
        """
        position = self.file_offset + base + self.offset(field)
        self._elf.seek(position)
        raw = self._elf.read(self._size(field))
        return self._decode(field, raw)

    def read_string(self, offset: int) -> str:
        """Reads the null-terminated string at an offset in the ELF image."""
        self._elf.seek(self.file_offset + offset)
        encoded = read_c_string(self._elf)
        return encoded.decode()
236
237
class Elf:
    """Represents an ELF file and the sections in it.

    The file may be a single ELF image or an archive containing ELF images;
    sections from every image found are collected into one tuple.
    """

    class Section(NamedTuple):
        """Info about a section in an ELF file."""

        name: str
        address: int
        offset: int
        size: int

        file_offset: int  # Starting place in the file; 0 unless in an archive.

        def range(self) -> range:
            """Returns the half-open range of addresses the section covers."""
            return range(self.address, self.address + self.size)

        def __lt__(self, other) -> bool:
            """Orders sections by address alone, ignoring the other fields."""
            return self.address < other.address

    def __init__(self, elf: BinaryIO):
        """Reads the section headers of every ELF image in the file.

        Args:
          elf: binary file object positioned at the start of an ELF file or
              archive; it is kept and used for later reads
        """
        self._elf = elf
        self.sections: tuple[Elf.Section, ...] = tuple(self._list_sections())

    def _list_sections(self) -> Iterable[Elf.Section]:
        """Reads the section headers to enumerate all ELF sections."""
        for _ in _elf_files_in_archive(self._elf):
            reader = FieldReader(self._elf)
            headers_base = reader.read(FILE_HEADER.section_header_offset)
            section_header_size = reader.offset(
                SECTION_HEADER.section_header_end
            )

            # Find the section with the section names in it.
            names_section_header_base = (
                headers_base
                + section_header_size
                * reader.read(FILE_HEADER.section_names_index)
            )
            names_table_base = reader.read(
                SECTION_HEADER.section_offset, names_section_header_base
            )

            # Walk the section header table one fixed-size entry at a time.
            section_count = reader.read(FILE_HEADER.section_count)
            for index in range(section_count):
                base = headers_base + index * section_header_size
                name_offset = reader.read(
                    SECTION_HEADER.section_name_offset, base
                )

                yield self.Section(
                    reader.read_string(names_table_base + name_offset),
                    reader.read(SECTION_HEADER.section_address, base),
                    reader.read(SECTION_HEADER.section_offset, base),
                    reader.read(SECTION_HEADER.section_size, base),
                    reader.file_offset,
                )

    def section_by_address(self, address: int) -> Elf.Section | None:
        """Returns the section that contains the provided address, if any."""
        # Iterate in reverse to give priority to sections with nonzero addresses
        for section in sorted(self.sections, reverse=True):
            if address in section.range():
                return section

        return None

    def sections_with_name(self, name: str) -> Iterable[Elf.Section]:
        """Yields every section whose name equals name, in file order."""
        for section in self.sections:
            if section.name == name:
                yield section

    def read_value(
        self, address: int, size: int | None = None
    ) -> bytes | None:
        """Reads specified bytes or null-terminated string at address.

        Args:
          address: address to read from; must fall inside a known section
          size: number of bytes to read, or None to read up to (and not
              including) a null terminator

        Returns:
          the bytes read, or None if no section contains the address
        """
        section = self.section_by_address(address)
        if not section:
            return None

        assert section.address <= address
        # Translate the address into a position within the underlying file.
        self._elf.seek(
            section.file_offset + section.offset + address - section.address
        )

        if size is None:
            return read_c_string(self._elf)

        return self._elf.read(size)

    def dump_sections(self, name: str | Pattern[str]) -> Mapping[str, bytes]:
        """Returns a mapping of section names to section contents.

        The name is used as a regular expression and applied with re.match, so
        it must match starting at the beginning of the section name.

        If processing an archive with multiple object files, the contents of
        sections with duplicate names are concatenated in the order they appear
        in the archive.
        """
        name_regex = re.compile(name)

        sections: Mapping[str, bytearray] = collections.defaultdict(bytearray)
        for section in self.sections:
            if name_regex.match(section.name):
                self._elf.seek(section.file_offset + section.offset)
                sections[section.name].extend(self._elf.read(section.size))

        return sections

    def dump_section_contents(self, name: str | Pattern[str]) -> bytes | None:
        """Dumps a binary string containing the sections matching the regex.

        If processing an archive with multiple object files, the contents of
        sections with duplicate names are concatenated in the order they appear
        in the archive.

        Returns:
          the concatenated contents, or None if no section name matched
        """
        sections = self.dump_sections(name)
        return b''.join(sections.values()) if sections else None

    def summary(self) -> str:
        """Returns one line per section: index, address, offset, size, name."""
        return '\n'.join(
            f'[{i:2}] {section.address:08x} {section.offset:08x} '
            f'{section.size:08x} {section.name}'
            for i, section in enumerate(self.sections)
        )

    def __str__(self) -> str:
        listing = ''.join(f'\n  {section},' for section in self.sections)
        return f'Elf({listing}\n)'
366
367
def _read_addresses(elf, size: int, output, address: Iterable[int]) -> None:
    """Reads the value at each address and passes it to output.

    Raises:
      ValueError: an address is not inside any section of the ELF
    """
    for location in address:
        contents = elf.read_value(location, size)

        if contents is None:
            raise ValueError(f'Invalid address 0x{location:08x}')

        output(contents)
376
377
def _dump_sections(elf: Elf, output, sections: Iterable[Pattern[str]]) -> None:
    """Outputs the contents of the matching sections, or a summary if none.

    With no section patterns, the encoded section summary is output instead.
    """
    if sections:
        for pattern in sections:
            output(elf.dump_section_contents(pattern))
    else:
        output(elf.summary().encode())
385
386
def _parse_args() -> argparse.Namespace:
    """Builds the command line interface and parses the program arguments.

    The returned namespace carries a `handler` entry naming the function to
    run for the selected subcommand, alongside that function's arguments.
    """

    def hex_int(arg):
        return int(arg, 16)

    root = argparse.ArgumentParser(description=__doc__)

    # Options shared by every subcommand.
    root.add_argument(
        '-e',
        '--elf',
        required=True,
        type=argparse.FileType('rb'),
        help='the ELF file to examine',
    )
    root.add_argument(
        '-d',
        '--delimiter',
        type=int,
        default=ord('\n'),
        help=r'delimiter to write after each value; \n by default',
    )

    # Without a subcommand there is nothing to do, so show the help text.
    root.set_defaults(handler=lambda **_: root.print_help())

    commands = root.add_subparsers(
        help='select whether to work with addresses or whole sections'
    )

    # "section": dump whole sections selected by regular expression.
    section_command = commands.add_parser('section')
    section_command.set_defaults(handler=_dump_sections)
    section_command.add_argument(
        'sections',
        nargs='*',
        metavar='section_regex',
        type=re.compile,  # type: ignore
        help='section name regular expression',
    )

    # "address": read values at specific addresses.
    address_command = commands.add_parser('address')
    address_command.set_defaults(handler=_read_addresses)
    address_command.add_argument(
        '--size',
        type=int,
        help='the size to read; reads until a null terminator by default',
    )
    address_command.add_argument(
        'address', nargs='+', type=hex_int, help='hexadecimal addresses to read'
    )

    return root.parse_args()
438
439
def _main(args):
    """Runs the handler selected on the command line.

    The handler and delimiter entries are removed from args, an output
    callback and the parsed Elf are stored in it, and everything that remains
    is passed to the handler as keyword arguments.
    """
    handler = args.handler
    delimiter = args.delimiter
    # These two entries are consumed here and are not handler arguments.
    del args.handler
    del args.delimiter

    def output(value):
        if value is None:
            return

        sys.stdout.buffer.write(value)
        sys.stdout.buffer.write(bytearray([delimiter]))
        sys.stdout.flush()

    args.output = output
    args.elf = Elf(args.elf)

    handler(**vars(args))
458
459
# Run the command line interface only when executed as a script.
if __name__ == '__main__':
    _main(_parse_args())
462