xref: /aosp_15_r20/external/angle/build/android/dump_apk_resource_strings.py (revision 8975f5c5ed3d1c378011245431ada316dfb6f244)
1#!/usr/bin/env vpython3
2# encoding: utf-8
3# Copyright 2019 The Chromium Authors
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""A script to parse and dump localized strings in resource.arsc files."""
8
9
10import argparse
11import collections
12import contextlib
13import cProfile
14import os
15import re
16import subprocess
17import sys
18import zipfile
19
20# pylint: disable=bare-except
21
22# Assuming this script is located under build/android, try to import
23# build/android/gyp/bundletool.py to get the default path to the bundletool
24# jar file. If this fail, using --bundletool-path will be required to parse
25# bundles, allowing this script to be relocated or reused somewhere else.
26try:
27  sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'gyp'))
28  import bundletool
29
30  _DEFAULT_BUNDLETOOL_PATH = bundletool.BUNDLETOOL_JAR_PATH
31except:
32  _DEFAULT_BUNDLETOOL_PATH = None
33
34# Try to get the path of the aapt build tool from catapult/devil.
35try:
36  import devil_chromium  # pylint: disable=unused-import
37  from devil.android.sdk import build_tools
38  _AAPT_DEFAULT_PATH = build_tools.GetPath('aapt')
39except:
40  _AAPT_DEFAULT_PATH = None
41
42
def AutoIndentStringList(lines, indentation=2):
  """Indents text lines according to the braces that open and close them.

  For example, the following input text:

    'Foo {',
    'Bar {',
    'Zoo',
    '}',
    '}',

  Will return the following:

    'Foo {',
    '  Bar {',
    '    Zoo',
    '  }',
    '}',

  Only two rules apply:
    - A line whose last character is an open brace ({) indents every
      following line one level deeper.
    - A line whose first character is a closing brace (}) is itself moved
      one level back, as are the lines after it. A closing brace found at
      the outermost level leaves the indentation untouched.

  This lets code that emits structured text ignore indentation entirely and
  rely on a final pass through this function for human-readable output.

  Args:
    lines: an iterable of input text lines, without line terminators
      (e.g. newline characters).
    indentation: number of spaces added per nesting level.
  Returns:
    A new list of text lines, properly auto-indented.
  """
  step = ' ' * indentation
  depth = 0
  prefix = ''
  indented = []
  # NOTE: Characters are compared by index rather than with startswith() /
  # endswith() on purpose: this loop is hot and indexing is noticeably faster.
  for text in lines:
    if not text:
      # An empty line still receives the current margin.
      indented.append(prefix)
      continue
    if text[0] == '}' and depth > 0:
      depth -= 1
      prefix = step * depth
    indented.append(prefix + text)
    if text[-1] == '{':
      depth += 1
      prefix = step * depth

  return indented
89
90
91# pylint: disable=line-too-long
92
93# NOTE: aapt dump will quote the following characters only: \n, \ and "
94# see https://cs.android.com/search?q=f:ResourceTypes.cpp
95
96# pylint: enable=line-too-long
97
98
def UnquoteString(s):
  """Undo the backslash quoting applied by 'aapt dump'.

  Each pair of consecutive backslashes collapses into a single backslash. A
  remaining (unpaired) backslash escapes the character that follows it: 'n'
  becomes a newline, any other character is kept as is. An unpaired backslash
  at the very end of the input is kept literally.

  Args:
    s: An UTF-8 encoded string that contains backslashes for quotes, as found
      in the output of 'aapt dump resources --values'.
  Returns:
    The unquoted version of the input string.
  """
  if '\\' not in s:
    return s

  pieces = []
  cursor = 0
  length = len(s)
  while cursor < length:
    slash = s.find('\\', cursor)
    if slash < 0:
      break

    # Copy the plain text that precedes the run of backslashes.
    pieces.append(s[cursor:slash])

    # Measure the run of consecutive backslashes starting at |slash|.
    run_end = slash + 1
    while run_end < length and s[run_end] == '\\':
      run_end += 1
    run_length = run_end - slash

    pieces.append('\\' * (run_length // 2))
    cursor = run_end
    if run_length % 2:
      if cursor < length:
        escaped = s[cursor]
        # \n is the only non-printable character supported.
        pieces.append('\n' if escaped == 'n' else escaped)
        cursor += 1
      else:
        # Dangling backslash at the end of the input.
        pieces.append('\\')

  pieces.append(s[cursor:])
  return ''.join(pieces)


assert UnquoteString(r'foo bar') == 'foo bar'
assert UnquoteString(r'foo\nbar') == 'foo\nbar'
assert UnquoteString(r'foo\\nbar') == 'foo\\nbar'
assert UnquoteString(r'foo\\\nbar') == 'foo\\\nbar'
assert UnquoteString(r'foo\n\nbar') == 'foo\n\nbar'
assert UnquoteString(r'foo\\bar') == r'foo\bar'
146
147
# Maps each character quoted by 'aapt dump' to its escaped form.
_QUOTE_TRANSLATION = str.maketrans({
    '\\': r'\\',
    '"': '\\"',
    '\n': '\\n',
})


def QuoteString(s):
  """Quote a given string for external output.

  Backslashes, double quotes and newlines are escaped with a backslash; every
  other character, including non-ASCII ones, is left untouched.

  Args:
    s: An input UTF-8 encoded string.
  Returns:
    A quoted version of the string, using the same rules as 'aapt dump'.
  """
  # NOTE: repr() is not used here because it would also escape all non-ASCII
  # characters in the string, which is undesirable.
  return s.translate(_QUOTE_TRANSLATION)


assert QuoteString(r'foo "bar"') == 'foo \\"bar\\"'
assert QuoteString('foo\nbar') == 'foo\\nbar'
163
164
def ReadStringMapFromRTxt(r_txt_path):
  """Collect the string resources declared in an R.txt file.

  Only lines that start with 'int string ' are considered; the rest of such
  a line must be exactly '<name> <id>' separated by one space.

  Args:
    r_txt_path: Input file path.
  Returns:
    A {res_id -> res_name} dictionary corresponding to the string resources
    from the input R.txt file.
  """
  # NOTE: Typical line of interest looks like:
  # int string AllowedDomainsForAppsTitle 0x7f130001
  marker = 'int string '
  id_to_name = {}
  with open(r_txt_path) as r_txt:
    for raw_line in r_txt:
      entry = raw_line.rstrip()
      if not entry.startswith(marker):
        continue
      name, id_text = entry[len(marker):].split(' ')
      # Base 0 lets int() honor the '0x' prefix of the resource ID.
      id_to_name[int(id_text, 0)] = name
  return id_to_name
185
186
class ResourceStringValues:
  """Models all possible values for a named string.

  Attributes:
    res_name: Name of the resource, or None until the first value is added.
    res_values: A {res_config -> res_value} dictionary.
  """

  def __init__(self):
    self.res_name = None
    self.res_values = {}

  def AddValue(self, res_name, res_config, res_value):
    """Add a new value to this entry.

    The first value recorded for a given config wins; later definitions for
    the same config are reported with a warning on stdout and ignored.

    Args:
      res_name: Resource name. If this is not the first time this method
        is called with the same resource name, then |res_name| should match
        previous parameters for sanity checking.
      res_config: Config associated with this value. This can actually be
        anything that can be converted to a string.
      res_value: UTF-8 encoded string value.
    """
    if self.res_name is None:
      self.res_name = res_name
    elif res_name != self.res_name:
      # Sanity check: the resource name should be the same for all chunks.
      # Resource ID is redefined with a different name!!
      print('WARNING: Resource key ignored (%s, should be %s)' %
            (res_name, self.res_name))

    # Test for the key explicitly. Comparing the result of setdefault() by
    # identity made the warning depend on whether two equal strings happened
    # to be the same object.
    if res_config in self.res_values:
      print('WARNING: Duplicate value definition for [config %s]: %s ' \
            '(already has %s)' % (
                res_config, res_value, self.res_values[res_config]))
    else:
      self.res_values[res_config] = res_value

  def ToStringList(self, res_id):
    """Convert entry to string list for human-friendly output.

    Args:
      res_id: Resource ID to print in the header line, or None when the
        resource ID should not be part of the output.
    Returns:
      A list of lines: a header ending with '{', one line per config (sorted
      by config string) holding the quoted value, and a closing '}'.
    """
    values = sorted(
        (str(config), value) for config, value in self.res_values.items())
    if res_id is None:
      header = 'name=%s count=%d {' % (self.res_name, len(values))
    else:
      header = 'res_id=0x%08x name=%s count=%d {' % (res_id, self.res_name,
                                                     len(values))
    result = [header]
    result.extend('%-16s "%s"' % (config, QuoteString(value))
                  for config, value in values)
    result.append('}')
    return result
236
237
class ResourceStringMap:
  """Holds every localized string of a resource table, keyed by resource ID.

  Typical usage:
     1) Create new (empty) instance.
     2) Call AddValue() repeatedly to add new values.
     3) Optionally call RemapResourceNames() to remap resource names.
     4) Call ToStringList() to convert the instance to a human-readable
        list of strings that can later be used with AutoIndentStringList()
        for example.
  """

  def __init__(self):
    # Entries are created on first access to a resource ID.
    self._res_map = collections.defaultdict(ResourceStringValues)

  def AddValue(self, res_id, res_name, res_config, res_value):
    """Record |res_value| for |res_config| under resource |res_id|."""
    entry = self._res_map[res_id]
    entry.AddValue(res_name, res_config, res_value)

  def RemapResourceNames(self, id_name_map):
    """Rename all entries according to a given {res_id -> res_name} map.

    IDs from |id_name_map| that are not in this table are ignored.
    """
    known = self._res_map
    for res_id, new_name in id_name_map.items():
      # .get() does not create a missing entry, unlike indexing.
      entry = known.get(res_id)
      if entry is not None:
        entry.res_name = new_name

  def ToStringList(self, omit_ids=False):
    """Dump content to a human-readable string list.

    Entries are sorted by resource name first, and resource id second.

    Args:
      omit_ids: If True, do not put resource IDs in the result. This might
        be useful when comparing the outputs of two different builds of the
        same APK, or two related APKs (e.g. ChromePublic.apk vs Chrome.apk)
        where the resource IDs might be slightly different, but not the
        string contents.
    Return:
      A list of strings that can later be sent to AutoIndentStringList().
    """
    ordered = sorted(self._res_map.items(),
                     key=lambda item: (item[1].res_name, item[0]))
    lines = ['Resource strings (count=%d) {' % len(ordered)]
    for res_id, values in ordered:
      shown_id = None if omit_ids else res_id
      lines.extend(values.ToStringList(shown_id))
    lines.append('}  # Resource strings')
    return lines
287
288
@contextlib.contextmanager
def ManagedOutput(output_file):
  """Create an output File object that will be closed on exit if necessary.

  Args:
    output_file: Optional output file path.
  Yields:
    If |output_file| is empty, this simply yields sys.stdout. Otherwise, this
    opens the file path for writing UTF-8 text, and yields its File object.
    The context will ensure that the object is always closed on scope exit.
    sys.stdout is never closed.
  """
  if not output_file:
    yield sys.stdout
    return

  # The dumped strings cover every locale, so always write UTF-8 instead of
  # the platform default encoding, which may not be able to represent them.
  with open(output_file, 'wt', encoding='utf-8') as output:
    yield output
311
312
@contextlib.contextmanager
def ManagedPythonProfiling(enable_profiling, sort_key='tottime'):
  """Optionally run the managed scope under the Python profiler.

  Args:
    enable_profiling: Boolean flag. True to enable python profiling.
    sort_key: Sorting key for the final stats dump.
  Yields:
    None when |enable_profiling| is false. Otherwise a new cProfile.Profile
    instance that has just been enabled; on scope exit the manager disables
    it and prints its statistics, even if the scope raised.
  """
  if not enable_profiling:
    yield None
    return

  profiler = cProfile.Profile()
  profiler.enable()
  try:
    yield profiler
  finally:
    profiler.disable()
    profiler.print_stats(sort=sort_key)
335
336
def IsFilePathABundle(input_file):
  """Return True iff |input_file| holds an Android app bundle.

  A bundle is recognized as a readable zip archive that contains a
  'BundleConfig.pb' entry. Any error while opening or inspecting the file
  (missing file, not a zip archive, missing entry, ...) yields False.

  Args:
    input_file: Path to the file to inspect.
  Returns:
    True if the file looks like an app bundle, False otherwise.
  """
  try:
    with zipfile.ZipFile(input_file) as input_zip:
      input_zip.getinfo('BundleConfig.pb')
  # Stay best-effort, but let KeyboardInterrupt and SystemExit propagate
  # instead of reporting them as "not a bundle".
  except Exception:  # pylint: disable=broad-except
    return False
  return True
345
346
347# Example output from 'bundletool dump resources --values' corresponding
348# to strings:
349#
350# 0x7F1200A0 - string/abc_action_menu_overflow_description
351#         (default) - [STR] "More options"
352#         locale: "ca" - [STR] "Més opcions"
353#         locale: "da" - [STR] "Flere muligheder"
354#         locale: "fa" - [STR] " گزینه<U+200C>های بیشتر"
355#         locale: "ja" - [STR] "その他のオプション"
356#         locale: "ta" - [STR] "மேலும் விருப்பங்கள்"
357#         locale: "nb" - [STR] "Flere alternativer"
358#         ...
359#
360# Fun fact #1: Bundletool uses <lang>-<REGION> instead of <lang>-r<REGION>
361#              for locales!
362#
363# Fun fact #2: The <U+200C> is terminal output for \u200c, the output is
364#              really UTF-8 encoded when it is read by this script.
365#
366# Fun fact #3: Bundletool quotes \n, \\ and \" just like aapt since 0.8.0.
367#
# Matches the first line of a string resource entry. Group 1 is the resource
# ID in upper-case hexadecimal (without the '0x' prefix), group 2 is the
# resource name that follows 'string/'.
_RE_BUNDLE_STRING_RESOURCE_HEADER = re.compile(
    r'^0x([0-9A-F]+)\s\-\sstring/(\w+)$')
assert _RE_BUNDLE_STRING_RESOURCE_HEADER.match(
    '0x7F1200A0 - string/abc_action_menu_overflow_description')

# Matches the indented '(default)' value line of an entry. Group 1 is the
# text found between the outermost double quotes, still in quoted form.
_RE_BUNDLE_STRING_DEFAULT_VALUE = re.compile(
    r'^\s+\(default\) - \[STR\] "(.*)"$')
assert _RE_BUNDLE_STRING_DEFAULT_VALUE.match(
    '        (default) - [STR] "More options"')
assert _RE_BUNDLE_STRING_DEFAULT_VALUE.match(
    '        (default) - [STR] "More options"').group(1) == "More options"
379
# Matches an indented per-locale value line of an entry. Group 1 is the
# locale name (ASCII letters, digits and dashes only), group 2 is the text
# found between the outermost double quotes, still in quoted form.
_RE_BUNDLE_STRING_LOCALIZED_VALUE = re.compile(
    r'^\s+locale: "([0-9a-zA-Z-]+)" - \[STR\] "(.*)"$')
assert _RE_BUNDLE_STRING_LOCALIZED_VALUE.match(
    '        locale: "ar" - [STR] "گزینه\u200cهای بیشتر"')
384
385
def ParseBundleResources(bundle_tool_jar_path, bundle_path):
  """Use bundletool to extract the localized strings of a given bundle.

  Runs 'bundletool dump resources --values' through 'java -jar' and parses
  its standard output line by line. Only string resources are recorded: a
  header line selects the current resource, and the default/localized value
  lines that directly follow it are added to the result. The exit status of
  the command is not checked.

  Args:
    bundle_tool_jar_path: Path to bundletool .jar executable.
    bundle_path: Path to input bundle.
  Returns:
    A new ResourceStringMap instance populated with the bundle's content.
  """
  cmd_args = [
      'java', '-jar', bundle_tool_jar_path, 'dump', 'resources', '--bundle',
      bundle_path, '--values'
  ]
  res_map = ResourceStringMap()
  current_resource_id = None
  current_resource_name = None
  need_value = False
  # Read the pipe as UTF-8 text: the regular expressions and string helpers
  # used below operate on str, not bytes. Undecodable bytes are replaced
  # rather than aborting the whole dump. The context manager closes the pipe
  # and waits for the child process on exit.
  with subprocess.Popen(cmd_args,
                        stdout=subprocess.PIPE,
                        encoding='utf-8',
                        errors='replace') as p:
    for line in p.stdout:
      # Do not use rstrip(), since this should only remove trailing newlines
      # but not trailing whitespace that happen to be embedded in the string
      # value for some reason.
      line = line.rstrip('\n\r')
      m = _RE_BUNDLE_STRING_RESOURCE_HEADER.match(line)
      if m:
        current_resource_id = int(m.group(1), 16)
        current_resource_name = m.group(2)
        need_value = True
        continue

      if not need_value:
        continue

      m = _RE_BUNDLE_STRING_DEFAULT_VALUE.match(line)
      if m:
        resource_config = 'config (default)'
        resource_value = m.group(1)
      else:
        m = _RE_BUNDLE_STRING_LOCALIZED_VALUE.match(line)
        if not m:
          # Not a value line: the current entry is over, ignore everything
          # until the next string resource header.
          need_value = False
          continue
        resource_config = 'config %s' % m.group(1)
        resource_value = m.group(2)

      res_map.AddValue(current_resource_id, current_resource_name,
                       resource_config, UnquoteString(resource_value))
  return res_map
441
442
# Name of the binary resources table file inside an APK.
RESOURCES_FILENAME = 'resources.arsc'


def IsFilePathAnApk(input_file):
  """Return True iff |input_file| holds a regular APK.

  An APK is recognized as a readable zip archive that contains a
  RESOURCES_FILENAME entry. Any error while opening or inspecting the file
  (missing file, not a zip archive, missing entry, ...) yields False.

  Args:
    input_file: Path to the file to inspect.
  Returns:
    True if the file looks like an APK, False otherwise.
  """
  try:
    with zipfile.ZipFile(input_file) as input_zip:
      input_zip.getinfo(RESOURCES_FILENAME)
  # Stay best-effort, but let KeyboardInterrupt and SystemExit propagate
  # instead of reporting them as "not an APK".
  except Exception:  # pylint: disable=broad-except
    return False
  return True
455
456
457# pylint: disable=line-too-long
458
459# Example output from 'aapt dump resources --values' corresponding
460# to strings:
461#
462#      config zh-rHK
463#        resource 0x7f12009c org.chromium.chrome:string/0_resource_name_obfuscated: t=0x03 d=0x0000caa9 (s=0x0008 r=0x00)
464#          (string8) "瀏覽首頁"
465#        resource 0x7f12009d org.chromium.chrome:string/0_resource_name_obfuscated: t=0x03 d=0x0000c8e0 (s=0x0008 r=0x00)
466#          (string8) "向上瀏覽"
467#
468
# The following are compiled regular expressions used to recognize each
# kind of line and extract relevant information.
471#
# Matches an indented configuration header line. Group 1 is the configuration
# name, i.e. everything between 'config ' and the trailing colon.
_RE_AAPT_CONFIG = re.compile(r'^\s+config (.+):$')
assert _RE_AAPT_CONFIG.match('   config (default):')
assert _RE_AAPT_CONFIG.match('   config zh-rTW:')

# Match an ISO 639-1 or ISO 639-2 locale: a two- or three-letter lower-case
# language, optionally followed by '-r' and a two- or three-letter upper-case
# region.
_RE_AAPT_ISO_639_LOCALE = re.compile(r'^[a-z]{2,3}(-r[A-Z]{2,3})?$')
assert _RE_AAPT_ISO_639_LOCALE.match('de')
assert _RE_AAPT_ISO_639_LOCALE.match('zh-rTW')
assert _RE_AAPT_ISO_639_LOCALE.match('fil')
assert not _RE_AAPT_ISO_639_LOCALE.match('land')

# Matches a configuration name written in the 'b+' form: 'b+' followed by a
# lower-case letter and then letters, digits or '+' separators.
_RE_AAPT_BCP47_LOCALE = re.compile(r'^b\+[a-z][a-zA-Z0-9\+]+$')
assert _RE_AAPT_BCP47_LOCALE.match('b+sr')
assert _RE_AAPT_BCP47_LOCALE.match('b+sr+Latn')
assert _RE_AAPT_BCP47_LOCALE.match('b+en+US')
assert not _RE_AAPT_BCP47_LOCALE.match('b+')
assert not _RE_AAPT_BCP47_LOCALE.match('b+1234')
489
# Matches the header line of a string resource. Group 1 is the resource ID in
# lower-case hexadecimal (without the '0x' prefix), group 2 is the resource
# name that follows '<package>:string/'. The rest of the line is ignored.
_RE_AAPT_STRING_RESOURCE_HEADER = re.compile(
    r'^\s+resource 0x([0-9a-f]+) [a-zA-Z][a-zA-Z0-9.]+:string/(\w+):.*$')
assert _RE_AAPT_STRING_RESOURCE_HEADER.match(
    r'  resource 0x7f12009c org.chromium.chrome:string/0_resource_name_obfuscated: t=0x03 d=0x0000caa9 (s=0x0008 r=0x00)'
)

# Matches the '(string8)' value line of a string resource. Group 1 is the
# text found between the outermost double quotes, still in quoted form.
_RE_AAPT_STRING_RESOURCE_VALUE = re.compile(r'^\s+\(string8\) "(.*)"$')
assert _RE_AAPT_STRING_RESOURCE_VALUE.match(r'       (string8) "瀏覽首頁"')
498
499# pylint: enable=line-too-long
500
501
def _ConvertAaptLocaleToBcp47(locale):
  """Convert a locale name from 'aapt dump' to its BCP-47 form.

  Names in the 'b+' form have their prefix removed and every '+' turned into
  a dash. For any other name, the first '-r' (the region marker) is replaced
  by a single dash; a name without region is returned unchanged.
  """
  if locale[:2] == 'b+':
    return locale[2:].replace('+', '-')
  language, _, region = locale.partition('-r')
  if not region:
    return language
  return '-'.join((language, region))


assert _ConvertAaptLocaleToBcp47('(default)') == '(default)'
assert _ConvertAaptLocaleToBcp47('en') == 'en'
assert _ConvertAaptLocaleToBcp47('en-rUS') == 'en-US'
assert _ConvertAaptLocaleToBcp47('en-US') == 'en-US'
assert _ConvertAaptLocaleToBcp47('fil') == 'fil'
assert _ConvertAaptLocaleToBcp47('b+sr+Latn') == 'sr-Latn'
518
519
def ParseApkResources(aapt_path, apk_path):
  """Use aapt to extract the localized strings of a given APK.

  Runs 'aapt dump --values resources' and parses its standard output line by
  line. A config header selects the current locale, a string resource header
  selects the current resource, and the line right after a resource header
  is expected to hold its value. The exit status of the command is not
  checked.

  Args:
    aapt_path: Path to the aapt executable.
    apk_path: Path to the input APK.
  Returns:
    A new ResourceStringMap instance populated with the APK's content.
  """
  cmd_args = [aapt_path, 'dump', '--values', 'resources', apk_path]

  res_map = ResourceStringMap()
  current_locale = None
  current_resource_id = -1  # represents undefined.
  current_resource_name = None
  need_value = False
  # The pipe is read as bytes and decoded per line, so that a line that is
  # not valid UTF-8 can be skipped without losing the rest of the dump. The
  # context manager closes the pipe and waits for the child process on exit.
  with subprocess.Popen(cmd_args, stdout=subprocess.PIPE) as p:
    for raw_line in p.stdout:
      try:
        line = raw_line.rstrip().decode('utf8')
      except UnicodeDecodeError:
        continue

      if not line:
        # A blank line is not the end of the output: iteration stops by
        # itself at end of file.
        continue

      m = _RE_AAPT_CONFIG.match(line)
      if m:
        aapt_locale = m.group(1)
        if (aapt_locale == '(default)'
            or _RE_AAPT_ISO_639_LOCALE.match(aapt_locale)
            or _RE_AAPT_BCP47_LOCALE.match(aapt_locale)):
          current_locale = _ConvertAaptLocaleToBcp47(aapt_locale)
        # NOTE(review): a config that is not a recognized locale keeps the
        # previous |current_locale|, so its strings are reported under that
        # locale. Confirm whether such configs should be skipped instead.
        continue

      if current_locale is None:
        continue

      if need_value:
        m = _RE_AAPT_STRING_RESOURCE_VALUE.match(line)
        if not m:
          # Should not happen
          sys.stderr.write(
              'WARNING: Missing value for string ID 0x%08x "%s"\n' %
              (current_resource_id, current_resource_name))
          resource_value = '<MISSING_STRING_%08x>' % current_resource_id
        else:
          resource_value = UnquoteString(m.group(1))

        res_map.AddValue(current_resource_id, current_resource_name,
                         'config %s' % current_locale, resource_value)
        need_value = False
      else:
        m = _RE_AAPT_STRING_RESOURCE_HEADER.match(line)
        if m:
          current_resource_id = int(m.group(1), 16)
          current_resource_name = m.group(2)
          need_value = True

  return res_map
583
584
def main(args):
  """Parse |args|, then dump the localized strings of the input file.

  The input is probed as an app bundle first, then as an APK; any other
  file is rejected through parser.error(). The dump is written to the
  --output path if given, to stdout otherwise.

  Args:
    args: List of command-line arguments, without the program name.
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
  parser.add_argument(
      'input_file',
      help='Input file path. This can be either an APK, or an app bundle.')
  parser.add_argument('--output', help='Optional output file path.')
  parser.add_argument(
      '--omit-ids',
      action='store_true',
      help='Omit resource IDs in the output. This is useful '
      'to compare the contents of two distinct builds of the '
      'same APK.')
  parser.add_argument(
      '--aapt-path',
      default=_AAPT_DEFAULT_PATH,
      help='Path to aapt executable. Optional for APKs.')
  parser.add_argument(
      '--r-txt-path',
      help='Path to an optional input R.txt file used to translate resource '
      'IDs to string names. Useful when resources names in the input files '
      'were obfuscated. NOTE: If ${INPUT_FILE}.R.txt exists, it will be used '
      'automatically by this script.')
  parser.add_argument(
      '--bundletool-path',
      default=_DEFAULT_BUNDLETOOL_PATH,
      help='Path to alternate bundletool .jar file. Only used for bundles.')
  parser.add_argument(
      '--profile', action='store_true', help='Enable Python profiling.')

  options = parser.parse_args(args)

  # Create a {res_id -> res_name} map for unobfuscation, if needed.
  res_id_name_map = {}
  r_txt_path = options.r_txt_path
  if not r_txt_path:
    # Fall back to the R.txt file sitting next to the input file, if any.
    candidate_r_txt_path = options.input_file + '.R.txt'
    if os.path.exists(candidate_r_txt_path):
      r_txt_path = candidate_r_txt_path

  if r_txt_path:
    res_id_name_map = ReadStringMapFromRTxt(r_txt_path)

  # Create a helper function that creates a new ResourceStringMap instance
  # based on the input file's type. Parsing is deferred so that it runs
  # inside the profiling scope below.
  if IsFilePathABundle(options.input_file):
    if not options.bundletool_path:
      parser.error(
          '--bundletool-path <BUNDLETOOL_JAR> is required to parse bundles.')

    # use bundletool to parse the bundle resources.
    def create_string_map():
      return ParseBundleResources(options.bundletool_path, options.input_file)

  elif IsFilePathAnApk(options.input_file):
    if not options.aapt_path:
      parser.error('--aapt-path <AAPT> is required to parse APKs.')

    # Use aapt dump to parse the APK resources.
    def create_string_map():
      return ParseApkResources(options.aapt_path, options.input_file)

  else:
    # parser.error() exits, so create_string_map is always defined below.
    parser.error('Unknown file format: %s' % options.input_file)

  # Print everything now.
  with ManagedOutput(options.output) as output:
    with ManagedPythonProfiling(options.profile):
      res_map = create_string_map()
      res_map.RemapResourceNames(res_id_name_map)
      lines = AutoIndentStringList(res_map.ToStringList(options.omit_ids))
      output.writelines(line + '\n' for line in lines)


if __name__ == "__main__":
  main(sys.argv[1:])
663