xref: /aosp_15_r20/development/scripts/symbol.py (revision 90c8c64db3049935a07c6143d7fd006e26f8ecca)
1#!/usr/bin/env python3
2#
3# Copyright (C) 2013 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""Module for looking up symbolic debugging information.
18
19The information can include symbol names, offsets, and source locations.
20"""
21
22import atexit
23import json
24import glob
25import os
26import platform
27import re
28import shutil
29import signal
30import subprocess
31import unittest
32
33ANDROID_BUILD_TOP = os.environ.get("ANDROID_BUILD_TOP", ".")
34
35
def FindClangDir():
  """Locate the prebuilt clang directory that matches this source tree.

  Returns:
    The path of the versioned clang prebuilt directory under
    ANDROID_BUILD_TOP, or None when build/soong/scripts/get_clang_version.py
    does not exist in the tree.

  Raises:
    subprocess.CalledProcessError: if get_clang_version.py exists but exits
      with a non-zero status.
  """
  version_script = ANDROID_BUILD_TOP + "/build/soong/scripts/get_clang_version.py"
  if not os.path.exists(version_script):
    return None

  # The failure is intentionally not caught: if the script is present but
  # cannot report a version, we want the caller to see the error.
  clang_version = subprocess.check_output(version_script, text=True).strip()
  return ANDROID_BUILD_TOP + "/prebuilts/clang/host/linux-x86/" + clang_version
45
46
def FindSymbolsDir():
  """Ask the build system where the unstripped (symbol) binaries live.

  Runs "soong_ui.bash --dumpvar-mode --abs TARGET_OUT_UNSTRIPPED" with
  ANDROID_BUILD_TOP as its working directory.

  Returns:
    The command's standard output with surrounding whitespace removed.
    The exit status is not checked, so this is an empty string when the
    command prints nothing (for example when soong_ui.bash is missing).
  """
  cmd = "build/soong/soong_ui.bash --dumpvar-mode --abs TARGET_OUT_UNSTRIPPED"
  # Pass cwd= instead of calling os.chdir() so the working directory of this
  # process is never changed, and use run() so the child is always waited
  # for rather than being left behind unreaped.
  completed = subprocess.run(cmd,
                             stdout=subprocess.PIPE,
                             universal_newlines=True,
                             shell=True,
                             cwd=ANDROID_BUILD_TOP)
  return completed.stdout.strip()
59
# Directory that is prepended to library paths when looking for unstripped
# binaries. Computed once, at import time, by running the build system.
SYMBOLS_DIR = FindSymbolsDir()

# None until SetBitness() has run; afterwards True or False.
ARCH_IS_32BIT = None

# Not read anywhere in this file.
VERBOSE = False

# These are private. Do not access them from other modules.
# _CACHED_TOOLCHAIN is filled in by FindToolchain(), _CACHED_CXX_FILT by
# CallCppFilt().
_CACHED_TOOLCHAIN = None
_CACHED_CXX_FILT = None

# Caches for symbolized information.
# {lib: {addr: [(symbol, "file:line")]}}, filled by CallLlvmSymbolizerForSet().
_SYMBOL_INFORMATION_ADDR2LINE_CACHE = {}
# {lib: {addr: (symbol, offset)}}, filled by CallObjdumpForSet().
_SYMBOL_INFORMATION_OBJDUMP_CACHE = {}
# {mangled_symbol: demangled_symbol}, filled by CallCppFilt().
_SYMBOL_DEMANGLING_CACHE = {}
74
75# Caches for pipes to subprocesses.
76
class ProcessCache:
  """A small LRU cache of long-running helper processes, keyed by command.

  Each cached process is started with text-mode pipes on stdin and stdout so
  that callers can keep feeding it requests. At most _PIPE_MAX_OPEN processes
  are kept alive per cache; the least recently used one is terminated when
  room is needed for a new one.
  """

  # Max number of open pipes.
  _PIPE_MAX_OPEN = 10

  def __init__(self):
    # The state is per instance. Keeping these as class attributes would make
    # every ProcessCache share (and evict from) the same dictionary.
    self._cmd2pipe = {}  # tuple(cmd) -> process
    self._lru = []       # [(tuple(cmd), process)], most recently used first.

  def GetProcess(self, cmd):
    """Return the process running cmd, starting it if it is not cached.

    Args:
      cmd: the command line as a list of strings.

    Returns:
      The process object created by SpawnProcess() for this command.
    """
    cmd_tuple = tuple(cmd)  # Need to use a tuple as lists can't be dict keys.

    pipe = self._cmd2pipe.get(cmd_tuple)
    if pipe is not None:
      # Already running: move it to the front of the LRU list.
      self._lru = ([(cmd_tuple, pipe)] +
                   [entry for entry in self._lru if entry[0] != cmd_tuple])
      return pipe

    # Not cached, yet. Make room by closing the least recently used ones.
    while len(self._lru) >= self._PIPE_MAX_OPEN:
      old_cmd, old_pipe = self._lru.pop()
      del self._cmd2pipe[old_cmd]
      self.TerminateProcess(old_pipe)

    # Create and put into cache.
    pipe = self.SpawnProcess(cmd)
    self._cmd2pipe[cmd_tuple] = pipe
    self._lru = [(cmd_tuple, pipe)] + self._lru
    return pipe

  def SpawnProcess(self, cmd):
    """Start cmd with text-mode pipes attached to its stdin and stdout."""
    return subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True)

  def TerminateProcess(self, pipe):
    """Close the pipes of a process, terminate it and wait for it to exit."""
    pipe.stdin.close()
    pipe.stdout.close()
    pipe.terminate()
    pipe.wait()

  def KillAllProcesses(self):
    """Terminate every cached process and leave the cache empty."""
    open_pipes = [pipe for _, pipe in self._lru]
    # Reset the instance state (not local variables) before terminating, so
    # that a later GetProcess() can never hand out a dead process, even if
    # terminating one of them raises.
    self._cmd2pipe = {}
    self._lru = []
    for pipe in open_pipes:
      self.TerminateProcess(pipe)
121
122
# Holds the llvm-symbolizer processes started by _GetJSONSymbolizerForLib().
_PIPE_ADDR2LINE_CACHE = ProcessCache()
# Holds the llvm-cxxfilt process started by CallCppFilt().
_PIPE_CPPFILT_CACHE = ProcessCache()
125
126
127# Process cache cleanup on shutdown.
128
def CloseAllPipes():
  """Terminate every helper process held by the module's process caches."""
  for cache in (_PIPE_ADDR2LINE_CACHE, _PIPE_CPPFILT_CACHE):
    cache.KillAllProcesses()
132
133
# Shut the helper processes down when the interpreter exits normally.
atexit.register(CloseAllPipes)
135
136
def PipeTermHandler(signum, frame):
  """Signal handler: shut down the helper processes, then exit at once.

  Args:
    signum: signal number (unused).
    frame: interrupted stack frame (unused).
  """
  CloseAllPipes()
  # os._exit() ends the process immediately with status 0; it does not run
  # atexit handlers, which is fine because the pipes were just closed above.
  os._exit(0)
140
141
# Route these terminating signals through PipeTermHandler so the helper
# processes are shut down before this process goes away.
for sig in (signal.SIGABRT, signal.SIGINT, signal.SIGTERM):
  signal.signal(sig, PipeTermHandler)
144
145
146
147
def ToolPath(tool, toolchain=None):
  """Resolve the command to use for running a toolchain binary.

  Args:
    tool: file name of the tool, e.g. "llvm-objdump".
    toolchain: (optional) directory to look in when the tool is not on PATH.
      FindToolchain() is consulted when this is not given.

  Returns:
    The bare tool name when it can be found on PATH, otherwise the tool name
    joined onto the toolchain directory.
  """
  on_path = shutil.which(tool)
  if on_path:
    return tool
  toolchain_dir = toolchain or FindToolchain()
  return os.path.join(toolchain_dir, tool)
155
156
def FindToolchain():
  """Return the directory holding the prebuilt LLVM binutils.

  The directory is looked up once and then remembered in _CACHED_TOOLCHAIN.

  Raises:
    Exception: if the llvm-binutils-stable directory does not exist under
      ANDROID_BUILD_TOP.
  """
  global _CACHED_TOOLCHAIN

  if not _CACHED_TOOLCHAIN:
    binutils_dir = ANDROID_BUILD_TOP + "/prebuilts/clang/host/linux-x86/llvm-binutils-stable/"
    if not os.path.exists(binutils_dir):
      raise Exception("Could not find llvm tool chain directory %s" % (binutils_dir))
    _CACHED_TOOLCHAIN = binutils_dir
    # Only announced the first time, when the directory is actually resolved.
    print("Using toolchain from:", _CACHED_TOOLCHAIN)

  return _CACHED_TOOLCHAIN
171
172
def SymbolInformation(lib, addr):
  """Symbolize a single address from the given library.

  Args:
    lib: library (or executable) pathname containing symbols
    addr: string hexadecimal address

  Returns:
    A list of (source_symbol, source_location, object_symbol_with_offset)
    tuples.

    When the function was inlined the list can hold several entries, the
    most deeply nested inlined location first. The list is never empty: if
    nothing is known it is [(None, None, None)].

    Usually you want to display the source_location and
    object_symbol_with_offset from the last element in the list.
  """
  by_addr = SymbolInformationForSet(lib, {addr})
  if by_addr:
    entries = by_addr.get(addr)
    if entries:
      return entries
  return [(None, None, None)]
194
195
def SymbolInformationForSet(lib, unique_addrs):
  """Symbolize a set of addresses that all belong to one library.

  Args:
    lib: library (or executable) pathname containing symbols
    unique_addrs: set of hexadecimal addresses

  Returns:
    None when lib is empty, or when either the symbolizer or the objdump
    lookup produced nothing. Otherwise a dictionary
    {addr: [(source_symbol, source_location, object_symbol_with_offset)]}.

    When the function was inlined the list can hold several entries, the
    most deeply nested inlined location first. The list for an address is
    never empty, even if no information is available.

    Usually you want to display the source_location and
    object_symbol_with_offset from the last element in the list.
  """
  if not lib:
    return None

  line_info = CallLlvmSymbolizerForSet(lib, unique_addrs)
  if not line_info:
    return None

  objdump_info = CallObjdumpForSet(lib, unique_addrs)
  if not objdump_info:
    return None

  symbolized = {}
  for addr in unique_addrs:
    # Fall back to a single placeholder frame when nothing was symbolized.
    frames = line_info.get(addr) or [(None, None)]

    containing_function = None
    if addr in objdump_info:
      name, offset = objdump_info[addr]
      containing_function = FormatSymbolWithOffset(name, offset)

    # The containing function is the same for every inlined frame.
    symbolized[addr] = [(symbol, location, containing_function)
                        for symbol, location in frames]

  return symbolized
242
243
244def _OptionalStackRecordField(json_result, field):
245  """Fix up bizarre formatting of llvm-symbolizer output
246
247  Some parts of the FRAME output are output as a string containing a hex
248  integer, or the empty string when it's missing.
249
250  Args:
251    json_result: dictionary containing the Frame response
252    field: name of the field we want to read
253
254  Returns:
255    integer of field value, or None if missing
256  """
257  value = json_result.get(field, "")
258  if isinstance(value, int):
259    # Leaving this here in case someone decides to fix the types of the
260    # symbolizer output, so it's easier to roll out.
261    return value
262  if value != "":
263    return int(value, 16)
264  return None
265
266
def _GetJSONSymbolizerForLib(lib, args=None):
  """Return a JSON-mode llvm-symbolizer process bound to the symbols of lib.

  The symbol file is looked for under SYMBOLS_DIR first and then at lib
  itself.

  Args:
    lib: library (or executable) pathname containing symbols
    args: (optional) list of arguments to pass to llvm-symbolizer

  Returns:
    child process, or None if no symbol file exists or the path found is a
    directory
  """
  extra_args = [] if args is None else args

  symbols = SYMBOLS_DIR + lib
  if not os.path.exists(symbols):
    symbols = lib

  # Only a path that exists and is not a directory can be symbolized.
  if not os.path.exists(symbols) or os.path.isdir(symbols):
    return None

  cmd = [ToolPath("llvm-symbolizer"), "--output-style=JSON"]
  cmd = cmd + extra_args + ["--obj=" + symbols]
  return _PIPE_ADDR2LINE_CACHE.GetProcess(cmd)
291
292
def GetStackRecordsForSet(lib, unique_addrs):
  """Query llvm-symbolizer for the stack frame layout at a set of addresses.

  Args:
    lib: library (or executable) pathname containing symbols
    unique_addrs: set of integer addresses to look up.

  Returns:
    None if no symbolizer could be started for lib. Otherwise a list of
    tuples
    (addr, function_name, local_name, file_line, frame_offset, size, tag_offset)
    describing the local variables of the stack frame.
    frame_offset, size, tag_offset may be None.
  """
  symbolizer = _GetJSONSymbolizerForLib(lib)
  if symbolizer is None:
    return None

  stack_records = []
  for addr in unique_addrs:
    # One request line in, one JSON reply line out.
    symbolizer.stdin.write("FRAME 0x%x\n" % addr)
    symbolizer.stdin.flush()
    reply = json.loads(symbolizer.stdout.readline().strip())
    stack_records.extend(
        (addr,
         local["FunctionName"],
         local["Name"],
         local["DeclFile"] + ":" + str(local["DeclLine"]),
         local.get("FrameOffset"),
         _OptionalStackRecordField(local, "Size"),
         _OptionalStackRecordField(local, "TagOffset"))
        for local in reply["Frame"])
  return stack_records
324
325
def CallLlvmSymbolizerForSet(lib, unique_addrs):
  """Look up line and symbol information for a set of addresses.

  Results are remembered per library in _SYMBOL_INFORMATION_ADDR2LINE_CACHE,
  so only addresses that have not been seen before are sent to
  llvm-symbolizer.

  Args:
    lib: library (or executable) pathname containing symbols
    unique_addrs: set of string hexadecimal addresses to look up.

  Returns:
    None if lib is empty or no symbolizer could be started for it.
    Otherwise a dictionary of the form {addr: [(symbol, file:line)]} where
    each address has a list of associated symbols and locations
    or an empty list if no symbol information was found.

    If the function has been inlined then the list may contain
    more than one element with the symbols for the most deeply
    nested inlined location appearing first.

    If talking to the symbolizer fails for an address, its list holds one
    (None, "<lib>  ***Error: <message>") entry instead.
  """
  if not lib:
    return None

  addr_cache = _SYMBOL_INFORMATION_ADDR2LINE_CACHE.setdefault(lib, {})

  # Split the request into already known addresses and ones that still need
  # to be symbolized, in a single pass.
  result = {}
  pending = []
  for addr in sorted(unique_addrs):
    if addr in addr_cache:
      result[addr] = addr_cache[addr]
    else:
      pending.append(addr)

  if not pending:
    # Everything was cached (or nothing was asked for), we're done.
    return result

  child = _GetJSONSymbolizerForLib(
    lib, ["--functions", "--inlines", "--demangle"])
  if child is None:
    return None

  for addr in pending:
    try:
      child.stdin.write("0x%s\n" % addr)
      child.stdin.flush()
      json_result = json.loads(child.stdout.readline().strip())
      # A reply without a "Symbol" list (presumably an error reply from the
      # symbolizer) simply yields no records for this address.
      # GNU style location: file_name:line_num
      records = [(symbol["FunctionName"],
                  "%s:%s" % (symbol["FileName"], symbol["Line"]))
                 for symbol in json_result.get("Symbol", [])]
    except (IOError, ValueError) as e:
      # IOError: the pipe to the symbolizer broke. ValueError: the reply was
      # empty or not valid JSON, e.g. because the symbolizer exited.
      # Remove the / in front of the library name to match other output.
      records = [(None, lib[1:] + "  ***Error: " + str(e))]
    result[addr] = records
    addr_cache[addr] = records
  return result
388
389
def CallObjdumpForSet(lib, unique_addrs):
  """Use objdump to find out the names of the containing functions.

  Results are remembered per library in _SYMBOL_INFORMATION_OBJDUMP_CACHE,
  so only addresses that have not been seen before are disassembled.

  Args:
    lib: library (or executable) pathname containing symbols
    unique_addrs: set of string hexadecimal addresses to find the functions
      for.

  Returns:
    None if lib is empty or no symbol file can be found for it. Otherwise a
    dictionary of the form {addr: (string symbol, offset)}. Addresses that
    do not show up as an instruction address in the disassembly are left
    out of the dictionary.
  """
  if not lib:
    return None

  addr_cache = _SYMBOL_INFORMATION_OBJDUMP_CACHE.setdefault(lib, {})

  # Split the request into already known addresses and ones that still need
  # to be looked up, in a single pass.
  result = {}
  pending = []
  for addr in unique_addrs:
    if addr in addr_cache:
      result[addr] = addr_cache[addr]
    else:
      pending.append(addr)

  if not pending:
    # Everything was cached (or nothing was asked for), we're done.
    return result

  symbols = SYMBOLS_DIR + lib
  if not os.path.exists(symbols):
    symbols = lib
    if not os.path.exists(symbols):
      return None

  # Order the targets by numeric value. Sorting the hex strings themselves
  # puts "1000" before "f00", which yields a wrong address range and a scan
  # order that does not match the disassembly.
  targets = sorted((int(addr, 16), addr) for addr in pending)

  start_addr_dec = str(targets[0][0])
  stop_addr_dec = str(targets[-1][0] + 8)
  cmd = [ToolPath("llvm-objdump"),
         "--section=.text",
         "--demangle",
         "--disassemble",
         "--start-address=" + start_addr_dec,
         "--stop-address=" + stop_addr_dec,
         symbols]

  # Function lines look like:
  #   000177b0 <android::IBinder::~IBinder()+0x2c>:
  # We pull out the address and function first. Then we check for an optional
  # offset. This is tricky due to functions that look like "operator+(..)+0x2c"
  func_regexp = re.compile(r"(^[a-f0-9]*) \<(.*)\>:$")
  offset_regexp = re.compile(r"(.*)\+0x([a-f0-9]*)")

  # A disassembly line looks like:
  #   177b2:	b510      	push	{r4, lr}
  asm_regexp = re.compile(r"(^[ a-f0-9]*):[ a-f0-9]*.*$")

  current_symbol = None    # The current function symbol in the disassembly.
  current_symbol_addr = 0  # The address of the current function.
  target_index = 0  # Index of the target that we are currently looking for.

  # The context manager closes the pipe and waits for objdump to exit, also
  # when we stop reading early.
  with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as objdump:
    for line in objdump.stdout:
      # Is it a function line like:
      #   000177b0 <android::IBinder::~IBinder()>:
      components = func_regexp.match(line)
      if components:
        # This is a new function, so record the current function and its
        # address.
        current_symbol_addr = int(components.group(1), 16)
        current_symbol = components.group(2)

        # Does it have an optional offset like: "foo(..)+0x2c"?
        components = offset_regexp.match(current_symbol)
        if components:
          current_symbol = components.group(1)
          offset = components.group(2)
          if offset:
            current_symbol_addr -= int(offset, 16)

      # Is it a disassembly line like:
      #   177b2:	b510      	push	{r4, lr}
      components = asm_regexp.match(line)
      if not components:
        continue
      i_addr = int(components.group(1), 16)

      # Consume every target at or below this instruction address. Targets
      # strictly below it can no longer match (the disassembly only moves
      # forward), so they are skipped instead of blocking the ones after.
      while target_index < len(targets) and targets[target_index][0] <= i_addr:
        i_target, target_addr = targets[target_index]
        if i_target == i_addr:
          result[target_addr] = (current_symbol, i_target - current_symbol_addr)
          addr_cache[target_addr] = result[target_addr]
        target_index += 1

      if target_index >= len(targets):
        break

  return result
492
493
def CallCppFilt(mangled_symbol):
  """Demangle a C++ symbol with llvm-cxxfilt.

  The llvm-cxxfilt binary is located on first use and remembered in
  _CACHED_CXX_FILT; demangled names are remembered in
  _SYMBOL_DEMANGLING_CACHE.

  Args:
    mangled_symbol: the mangled symbol name.

  Returns:
    The line that llvm-cxxfilt printed for the symbol, stripped of
    surrounding whitespace.

  Raises:
    Exception: if llvm-cxxfilt cannot be located, or if more than one
      candidate is found relative to the current directory.
  """
  global _CACHED_CXX_FILT

  try:
    return _SYMBOL_DEMANGLING_CACHE[mangled_symbol]
  except KeyError:
    pass

  if not _CACHED_CXX_FILT:
    clang_dir = FindClangDir()
    if clang_dir:
      cxxfilt = clang_dir + "/bin/llvm-cxxfilt"
      if not os.path.exists(cxxfilt):
        raise Exception("bin/llvm-cxxfilt missing from " + clang_dir)
      candidates = [cxxfilt]
    else:
      # When run in CI, we don't have a way to find the clang version.  But
      # llvm-cxxfilt should be available in the following relative path.
      candidates = glob.glob("./clang-r*/bin/llvm-cxxfilt")
      if len(candidates) > 1:
        raise Exception("Expected one llvm-cxxfilt but found many: " + \
                        ", ".join(candidates))
    if not candidates:
      raise Exception("Could not find llvm-cxxfilt tool")
    _CACHED_CXX_FILT = max(candidates)

  demangler = _PIPE_CPPFILT_CACHE.GetProcess([_CACHED_CXX_FILT])
  # One symbol per line in, one demangled line out.
  demangler.stdin.write(mangled_symbol)
  demangler.stdin.write("\n")
  demangler.stdin.flush()
  demangled = demangler.stdout.readline().strip()

  _SYMBOL_DEMANGLING_CACHE[mangled_symbol] = demangled
  return demangled
529
530
def FormatSymbolWithOffset(symbol, offset):
  """Render a symbol and its offset as "symbol+offset".

  Args:
    symbol: the symbol name.
    offset: integer offset from the start of the symbol.

  Returns:
    The symbol unchanged when the offset is 0, otherwise the symbol
    followed by "+" and the offset in decimal.
  """
  return symbol if offset == 0 else "%s+%d" % (symbol, offset)
535
def FormatSymbolWithoutParameters(symbol):
  """Strip the parameter lists from a demangled function name.

  No attempt is made to parse the C++ signature. Every top-level "(...)"
  group is cut out, while "<...>" groups (and anything nested inside them)
  are kept. A symbol whose brackets do not pair up is returned unchanged.
  """
  if not symbol:
    return symbol

  text = symbol
  for old, new in ((") const", ")"),                  # Strip const keyword.
                   ("operator<<", "operator\u00AB"),  # Avoid unmatched '<'.
                   ("operator>>", "operator\u00BB"),  # Avoid unmatched '>'.
                   ("operator->", "operator\u2192")): # Avoid unmatched '>'.
    text = text.replace(old, new)

  closer_for = {'(': ')', '<': '>'}
  open_closers = []  # Closing brackets seen whose opener is still to come.

  # Walk from the end, so that cutting never moves the part still to visit.
  for pos in range(len(text) - 1, -1, -1):
    ch = text[pos]
    if ch in (')', '>'):
      if not open_closers:
        cut_end = pos + 1  # One past the closer of a top-level pair.
      open_closers.append(ch)
    elif ch in ('(', '<'):
      if not open_closers or open_closers.pop() != closer_for[ch]:
        return symbol  # Malformed: character does not match its pair.
      # Only cut non-empty top-level parentheses; "()" is left in place.
      if not open_closers and ch == '(' and (cut_end - pos) > 2:
        text = text[:pos] + text[cut_end:]

  if open_closers:
    return symbol  # Malformed: missing pair.

  return text.strip()
566  return result.strip()
567
def SetBitness(lines):
  """Guess from stack trace lines whether the process was 32 bit.

  The answer is stored in the module-level ARCH_IS_32BIT; nothing is
  returned. When no line gives a hint, ARCH_IS_32BIT is set to False.

  Args:
    lines: iterable of strings, the lines of the trace.
  """
  global ARCH_IS_32BIT

  # Raw strings, so that the regex escapes reach the re module untouched.
  # In a normal string literal "\#" is an invalid escape sequence, which
  # newer Python versions warn about.
  trace_line = re.compile(r"\#[0-9]+[ \t]+..[ \t]+([0-9a-f]{8}|[0-9a-f]{16})([ \t]+|$)")
  asan_trace_line = re.compile(r"\#[0-9]+[ \t]+0x([0-9a-f]+)[ \t]+")

  ARCH_IS_32BIT = False
  for line in lines:
    trace_match = trace_line.search(line)
    if trace_match:
      # Addresses in these lines are zero padded to the pointer size, so the
      # number of digits gives the bitness.
      ARCH_IS_32BIT = len(trace_match.group(1)) != 16
      break

    asan_trace_match = asan_trace_line.search(line)
    if asan_trace_match:
      # We might be able to guess the bitness by the length of the address.
      if len(asan_trace_match.group(1)) > 8:
        # We know for a fact this is 64 bit, so we are done.
        ARCH_IS_32BIT = False
        break
      # This might be 32 bit, or just a small address. Keep going in this
      # case, but if we couldn't figure anything else out, go with 32 bit.
      ARCH_IS_32BIT = True
595
# Checks that FindClangDir() finds a clang directory.
class FindClangDirTests(unittest.TestCase):
  # ANDROID_BUILD_TOP is "." when the environment variable is not set; the
  # test is skipped in that case.
  @unittest.skipIf(ANDROID_BUILD_TOP == '.', 'Test only supported in an Android tree.')
  def test_clang_dir_found(self):
    self.assertIsNotNone(FindClangDir())
600
class SetBitnessTests(unittest.TestCase):
  """Checks the bitness that SetBitness() derives from trace lines."""

  # ARCH_IS_32BIT is only read here, so no "global" statement is needed.

  def test_32bit_check(self):
    # An 8 digit address in a native trace line means 32 bit.
    trace = ["#00 pc 000374e0"]
    SetBitness(trace)
    self.assertTrue(ARCH_IS_32BIT)

  def test_64bit_check(self):
    # A 16 digit address in a native trace line means 64 bit.
    trace = ["#00 pc 00000000000374e0"]
    SetBitness(trace)
    self.assertFalse(ARCH_IS_32BIT)

  def test_32bit_asan_trace_line_toolchain(self):
    # Only short addresses in ASAN lines: assume 32 bit.
    trace = ["#10 0xb5eeba5d  (/system/vendor/lib/egl/libGLESv1_CM_adreno.so+0xfa5d)"]
    SetBitness(trace)
    self.assertTrue(ARCH_IS_32BIT)

  def test_64bit_asan_trace_line_toolchain(self):
    # A later long address overrides the earlier 32 bit guess.
    trace = ["#12 0x5d33bf  (/system/lib/libclang_rt.asan-arm-android.so+0x823bf)",
             "#12 0x11b35d33bf  (/system/lib/libclang_rt.asan-arm-android.so+0x823bf)"]
    SetBitness(trace)
    self.assertFalse(ARCH_IS_32BIT)
626
class FormatSymbolWithoutParametersTests(unittest.TestCase):
  """Checks the parameter stripping done by FormatSymbolWithoutParameters()."""

  def test_c(self):
    # Plain C names have nothing to strip.
    strip = FormatSymbolWithoutParameters
    self.assertEqual(strip("foo"), "foo")
    self.assertEqual(strip("foo+42"), "foo+42")

  def test_simple(self):
    strip = FormatSymbolWithoutParameters
    self.assertEqual(strip("foo(int i)"), "foo")
    self.assertEqual(strip("foo(int i)+42"), "foo+42")
    self.assertEqual(strip("bar::foo(int i)+42"), "bar::foo+42")
    # Empty parentheses are kept.
    self.assertEqual(strip("operator()"), "operator()")

  def test_templates(self):
    # Template arguments stay, even when they contain parentheses.
    strip = FormatSymbolWithoutParameters
    self.assertEqual(strip("bar::foo<T>(vector<T>& v)"), "bar::foo<T>")
    self.assertEqual(strip("bar<T>::foo(vector<T>& v)"), "bar<T>::foo")
    self.assertEqual(strip("bar::foo<T>(vector<T<U>>& v)"), "bar::foo<T>")
    self.assertEqual(strip("bar::foo<(EnumType)0>(vector<(EnumType)0>& v)"),
                     "bar::foo<(EnumType)0>")

  def test_nested(self):
    strip = FormatSymbolWithoutParameters
    self.assertEqual(strip("foo(int i)::bar(int j)"), "foo::bar")

  def test_unbalanced(self):
    # Malformed input comes back untouched.
    strip = FormatSymbolWithoutParameters
    self.assertEqual(strip("foo(bar(int i)"), "foo(bar(int i)")
    self.assertEqual(strip("foo)bar(int i)"), "foo)bar(int i)")
    self.assertEqual(strip("foo<bar(int i)"), "foo<bar(int i)")
    self.assertEqual(strip("foo>bar(int i)"), "foo>bar(int i)")
653
# Running this file directly runs the unit tests defined above, with one
# line of output per test.
if __name__ == '__main__':
    unittest.main(verbosity=2)
656