#!/usr/bin/env vpython3
# encoding: utf-8
# Copyright 2019 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""A script to parse and dump localized strings in resource.arsc files."""


import argparse
import collections
import contextlib
import cProfile
import os
import re
import subprocess
import sys
import zipfile

# pylint: disable=bare-except

# Assuming this script is located under build/android, try to import
# build/android/gyp/bundletool.py to get the default path to the bundletool
# jar file. If this fails, using --bundletool-path will be required to parse
# bundles, allowing this script to be relocated or reused somewhere else.
try:
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'gyp'))
  import bundletool

  _DEFAULT_BUNDLETOOL_PATH = bundletool.BUNDLETOOL_JAR_PATH
except:
  _DEFAULT_BUNDLETOOL_PATH = None

# Try to get the path of the aapt build tool from catapult/devil.
try:
  import devil_chromium  # pylint: disable=unused-import
  from devil.android.sdk import build_tools
  _AAPT_DEFAULT_PATH = build_tools.GetPath('aapt')
except:
  _AAPT_DEFAULT_PATH = None


def AutoIndentStringList(lines, indentation=2):
  """Auto-indents an input list of text lines, based on open/closed braces.

  For example, the following input text:

    'Foo {',
    'Bar {',
    'Zoo',
    '}',
    '}',

  Will return the following:

    'Foo {',
    '  Bar {',
    '    Zoo',
    '  }',
    '}',

  The rules are pretty simple:
    - A line that ends with an open brace ({) increments indentation.
    - A line that starts with a closing brace (}) decrements it.

  The main idea is to make outputting structured text data trivial,
  since it can be assumed that the final output will be passed through
  this function to make it human-readable.

  Args:
    lines: an iterator over input text lines. They should not contain
      line terminator (e.g. '\n').
    indentation: number of spaces added per nesting level.
  Returns:
    A new list of text lines, properly auto-indented.
  """
  margin = ''
  result = []
  # NOTE: Intentional but significant speed optimizations in this function:
  #   - |line and line[0] == <char>| instead of |line.startswith(<char>)|.
  #   - |line and line[-1] == <char>| instead of |line.endswith(<char>)|.
  for line in lines:
    if line and line[0] == '}':
      margin = margin[:-indentation]
    result.append(margin + line)
    if line and line[-1] == '{':
      margin += ' ' * indentation

  return result


# pylint: disable=line-too-long

# NOTE: aapt dump will quote the following characters only: \n, \ and "
# see https://cs.android.com/search?q=f:ResourceTypes.cpp

# pylint: enable=line-too-long


def UnquoteString(s):
  """Unquote a given string from aapt dump.

  Runs of backslashes are halved; an odd run escapes the character that
  follows it ('\\n' becomes a newline, any other character is kept as-is).
  A lone trailing backslash is preserved.

  Args:
    s: An UTF-8 encoded string that contains backslashes for quotes, as found
      in the output of 'aapt dump resources --values'.
  Returns:
    The unquoted version of the input string.
  """
  if '\\' not in s:
    return s

  # Collect fragments and join once instead of repeated string concatenation.
  pieces = []
  start = 0
  size = len(s)
  while start < size:
    pos = s.find('\\', start)
    if pos < 0:
      break

    pieces.append(s[start:pos])
    # Count the length of the backslash run starting at |pos|.
    count = 1
    while pos + count < size and s[pos + count] == '\\':
      count += 1

    # Each pair of backslashes stands for one literal backslash.
    pieces.append('\\' * (count // 2))
    start = pos + count
    if count & 1:
      # Odd run: the last backslash escapes the following character.
      if start < size:
        ch = s[start]
        if ch == 'n':  # \n is the only non-printable character supported.
          ch = '\n'
        pieces.append(ch)
        start += 1
      else:
        # Dangling backslash at end of input: keep it verbatim.
        pieces.append('\\')

  pieces.append(s[start:])
  return ''.join(pieces)


assert UnquoteString(r'foo bar') == 'foo bar'
assert UnquoteString(r'foo\nbar') == 'foo\nbar'
assert UnquoteString(r'foo\\nbar') == 'foo\\nbar'
assert UnquoteString(r'foo\\\nbar') == 'foo\\\nbar'
assert UnquoteString(r'foo\n\nbar') == 'foo\n\nbar'
assert UnquoteString(r'foo\\bar') == r'foo\bar'


def QuoteString(s):
  """Quote a given string for external output.

  Args:
    s: An input UTF-8 encoded string.
  Returns:
    A quoted version of the string, using the same rules as 'aapt dump'.
  """
  # NOTE: Using repr() would escape all non-ASCII bytes in the string, which
  # is undesirable.
  return s.replace('\\', r'\\').replace('"', '\\"').replace('\n', '\\n')


assert QuoteString(r'foo "bar"') == 'foo \\"bar\\"'
assert QuoteString('foo\nbar') == 'foo\\nbar'


def ReadStringMapFromRTxt(r_txt_path):
  """Read all string resource IDs and names from an R.txt file.

  Args:
    r_txt_path: Input file path.
  Returns:
    A {res_id -> res_name} dictionary corresponding to the string resources
    from the input R.txt file.
  """
  # NOTE: Typical line of interest looks like:
  # int string AllowedDomainsForAppsTitle 0x7f130001
  result = {}
  prefix = 'int string '
  with open(r_txt_path) as f:
    for line in f:
      line = line.rstrip()
      if line.startswith(prefix):
        res_name, res_id = line[len(prefix):].split(' ')
        # Base 0 lets int() honor the '0x' prefix.
        result[int(res_id, 0)] = res_name
  return result


class ResourceStringValues:
  """Models all possible values for a named string."""

  def __init__(self):
    # Resource name, set by the first AddValue() call.
    self.res_name = None
    # Maps a config key to the string value recorded for it.
    self.res_values = {}

  def AddValue(self, res_name, res_config, res_value):
    """Add a new value to this entry.

    The first value recorded for a given config wins; later definitions for
    the same config only print a warning.

    Args:
      res_name: Resource name. If this is not the first time this method
        is called with the same resource name, then |res_name| should match
        previous parameters for sanity checking.
      res_config: Config associated with this value. This can actually be
        anything that can be converted to a string.
      res_value: UTF-8 encoded string value.
    """
    if res_name != self.res_name:
      if self.res_name is None:
        self.res_name = res_name
      else:
        # Sanity check: the resource name should be the same for all chunks.
        # Resource ID is redefined with a different name!!
        print('WARNING: Resource key ignored (%s, should be %s)' %
              (res_name, self.res_name))

    # Use an explicit membership test: comparing strings by identity would
    # miss duplicates whenever both values happen to be the same object
    # (e.g. empty strings).
    if res_config in self.res_values:
      print('WARNING: Duplicate value definition for [config %s]: %s ' \
            '(already has %s)' % (
                res_config, res_value, self.res_values[res_config]))
    else:
      self.res_values[res_config] = res_value

  def ToStringList(self, res_id):
    """Convert entry to string list for human-friendly output.

    Args:
      res_id: Resource ID to print in the header, or None to omit it.
    Returns:
      A list of lines: a header ending in '{', one line per config sorted by
      config string, and a closing '}'.
    """
    values = sorted(
        (str(config), value) for config, value in self.res_values.items())
    if res_id is None:
      # res_id will be None when the resource ID should not be part
      # of the output.
      result = ['name=%s count=%d {' % (self.res_name, len(values))]
    else:
      result = [
          'res_id=0x%08x name=%s count=%d {' % (res_id, self.res_name,
                                                len(values))
      ]
    for config, value in values:
      result.append('%-16s "%s"' % (config, QuoteString(value)))
    result.append('}')
    return result


class ResourceStringMap:
  """Convenience class to hold the set of all localized strings in a table.

  Usage is the following:
     1) Create new (empty) instance.
     2) Call AddValue() repeatedly to add new values.
     3) Eventually call RemapResourceNames() to remap resource names.
     4) Call ToStringList() to convert the instance to a human-readable
        list of strings that can later be used with AutoIndentStringList()
        for example.
  """

  def __init__(self):
    # Maps a resource ID to its ResourceStringValues, created on first use.
    self._res_map = collections.defaultdict(ResourceStringValues)

  def AddValue(self, res_id, res_name, res_config, res_value):
    """Record |res_value| for |res_config| under resource |res_id|."""
    self._res_map[res_id].AddValue(res_name, res_config, res_value)

  def RemapResourceNames(self, id_name_map):
    """Rename all entries according to a given {res_id -> res_name} map."""
    for res_id, res_name in id_name_map.items():
      # Membership test first: indexing the defaultdict would create entries
      # for IDs that have no values.
      if res_id in self._res_map:
        self._res_map[res_id].res_name = res_name

  def ToStringList(self, omit_ids=False):
    """Dump content to a human-readable string list.

    Note that the strings are ordered by their resource name first, and
    resource id second.

    Args:
      omit_ids: If True, do not put resource IDs in the result. This might
        be useful when comparing the outputs of two different builds of the
        same APK, or two related APKs (e.g. ChromePublic.apk vs Chrome.apk)
        where the resource IDs might be slightly different, but not the
        string contents.
    Return:
      A list of strings that can later be sent to AutoIndentStringList().
    """
    result = ['Resource strings (count=%d) {' % len(self._res_map)]

    # Order (res_id, values) pairs by resource name first, then resource ID.
    ordered = sorted(self._res_map.items(),
                     key=lambda item: (item[1].res_name, item[0]))
    for res_id, values in ordered:
      result += values.ToStringList(None if omit_ids else res_id)
    result.append('} # Resource strings')
    return result


@contextlib.contextmanager
def ManagedOutput(output_file):
  """Create an output File object that will be closed on exit if necessary.

  Args:
    output_file: Optional output file path.
  Yields:
    If |output_file| is empty, this simply yields sys.stdout. Otherwise, this
    opens the file path for writing UTF-8 text, and yields its File object.
    The context will ensure that the object is always closed on scope exit.
  """
  if not output_file:
    yield sys.stdout
    return

  # The dumped strings are routinely non-ASCII, so do not depend on the
  # locale's default encoding.
  with open(output_file, 'wt', encoding='utf-8') as output:
    yield output


@contextlib.contextmanager
def ManagedPythonProfiling(enable_profiling, sort_key='tottime'):
  """Enable Python profiling if needed.

  Args:
    enable_profiling: Boolean flag. True to enable python profiling.
    sort_key: Sorting key for the final stats dump.
  Yields:
    If |enable_profiling| is False, this yields None. Otherwise, this
    yields a new Profile instance just after enabling it. The manager
    ensures that profiling stops and prints statistics on scope exit.
  """
  pr = None
  if enable_profiling:
    pr = cProfile.Profile()
    pr.enable()
  try:
    yield pr
  finally:
    if pr:
      pr.disable()
      pr.print_stats(sort=sort_key)


def IsFilePathABundle(input_file):
  """Return True iff |input_file| holds an Android app bundle."""
  # Best-effort probe: any failure (missing file, not a zip archive, no
  # BundleConfig.pb entry) means "not a bundle".
  try:
    with zipfile.ZipFile(input_file) as input_zip:
      _ = input_zip.getinfo('BundleConfig.pb')
      return True
  except:
    return False


# Example output from 'bundletool dump resources --values' corresponding
# to strings:
#
# 0x7F1200A0 - string/abc_action_menu_overflow_description
#         (default) - [STR] "More options"
#         locale: "ca" - [STR] "Més opcions"
#         locale: "da" - [STR] "Flere muligheder"
#         locale: "fa" - [STR] " گزینه<U+200C>های بیشتر"
#         locale: "ja" - [STR] "その他のオプション"
#         locale: "ta" - [STR] "மேலும் விருப்பங்கள்"
#         locale: "nb" - [STR] "Flere alternativer"
#         ...
#
# Fun fact #1: Bundletool uses <lang>-<REGION> instead of <lang>-r<REGION>
#              for locales!
#
# Fun fact #2: The <U+200C> is terminal output for \u200c, the output is
#              really UTF-8 encoded when it is read by this script.
#
# Fun fact #3: Bundletool quotes \n, \\ and \" just like aapt since 0.8.0.
#
_RE_BUNDLE_STRING_RESOURCE_HEADER = re.compile(
    r'^0x([0-9A-F]+)\s\-\sstring/(\w+)$')
assert _RE_BUNDLE_STRING_RESOURCE_HEADER.match(
    '0x7F1200A0 - string/abc_action_menu_overflow_description')

_RE_BUNDLE_STRING_DEFAULT_VALUE = re.compile(
    r'^\s+\(default\) - \[STR\] "(.*)"$')
assert _RE_BUNDLE_STRING_DEFAULT_VALUE.match(
    ' (default) - [STR] "More options"')
assert _RE_BUNDLE_STRING_DEFAULT_VALUE.match(
    ' (default) - [STR] "More options"').group(1) == "More options"

_RE_BUNDLE_STRING_LOCALIZED_VALUE = re.compile(
    r'^\s+locale: "([0-9a-zA-Z-]+)" - \[STR\] "(.*)"$')
assert _RE_BUNDLE_STRING_LOCALIZED_VALUE.match(
    ' locale: "ar" - [STR] "گزینه\u200cهای بیشتر"')


def ParseBundleResources(bundle_tool_jar_path, bundle_path):
  """Use bundletool to extract the localized strings of a given bundle.

  Runs 'java -jar <bundletool> dump resources --bundle <bundle> --values' and
  parses its standard output line by line. Lines that are not valid UTF-8
  are skipped. The exit status of the command is not checked.

  Args:
    bundle_tool_jar_path: Path to bundletool .jar executable.
    bundle_path: Path to input bundle.
  Returns:
    A new ResourceStringMap instance populated with the bundle's content.
  """
  cmd_args = [
      'java', '-jar', bundle_tool_jar_path, 'dump', 'resources', '--bundle',
      bundle_path, '--values'
  ]
  res_map = ResourceStringMap()
  current_resource_id = None
  current_resource_name = None
  # True while the lines following a string resource header are being read.
  need_value = False
  # The pipe yields bytes: decode each line before using str methods or the
  # str regular expressions above. The context manager closes the pipe and
  # waits for the process on exit.
  with subprocess.Popen(cmd_args, stdout=subprocess.PIPE) as p:
    for raw_line in p.stdout:
      try:
        # Do not use a bare rstrip(), since this should only remove trailing
        # newlines but not trailing whitespace that happen to be embedded in
        # the string value for some reason.
        line = raw_line.decode('utf8').rstrip('\n\r')
      except UnicodeDecodeError:
        continue

      m = _RE_BUNDLE_STRING_RESOURCE_HEADER.match(line)
      if m:
        current_resource_id = int(m.group(1), 16)
        current_resource_name = m.group(2)
        need_value = True
        continue

      if not need_value:
        continue

      resource_config = None
      m = _RE_BUNDLE_STRING_DEFAULT_VALUE.match(line)
      if m:
        resource_config = 'config (default)'
        resource_value = m.group(1)
      else:
        m = _RE_BUNDLE_STRING_LOCALIZED_VALUE.match(line)
        if m:
          resource_config = 'config %s' % m.group(1)
          resource_value = m.group(2)

      if resource_config is None:
        # Neither a default nor a localized value: the value list of the
        # current resource is over.
        need_value = False
        continue

      res_map.AddValue(current_resource_id, current_resource_name,
                       resource_config, UnquoteString(resource_value))
  return res_map


# Name of the binary resources table file inside an APK.
RESOURCES_FILENAME = 'resources.arsc'


def IsFilePathAnApk(input_file):
  """Returns True iff |input_file| is a zip archive with a resources table."""
  # Best-effort probe: any failure (missing file, not a zip archive, no
  # resources.arsc entry) means "not an APK".
  try:
    with zipfile.ZipFile(input_file) as input_zip:
      _ = input_zip.getinfo(RESOURCES_FILENAME)
      return True
  except:
    return False


# pylint: disable=line-too-long

# Example output from 'aapt dump resources --values' corresponding
# to strings:
#
#      config zh-rHK
#        resource 0x7f12009c org.chromium.chrome:string/0_resource_name_obfuscated: t=0x03 d=0x0000caa9 (s=0x0008 r=0x00)
#          (string8) "瀏覽首頁"
#        resource 0x7f12009d org.chromium.chrome:string/0_resource_name_obfuscated: t=0x03 d=0x0000c8e0 (s=0x0008 r=0x00)
#          (string8) "向上瀏覽"
#

# The following are compiled regular expressions used to recognize each
# kind of line and extract relevant information.
#
_RE_AAPT_CONFIG = re.compile(r'^\s+config (.+):$')
assert _RE_AAPT_CONFIG.match(' config (default):')
assert _RE_AAPT_CONFIG.match(' config zh-rTW:')

# Match an ISO 639-1 or ISO 639-2 locale.
_RE_AAPT_ISO_639_LOCALE = re.compile(r'^[a-z]{2,3}(-r[A-Z]{2,3})?$')
assert _RE_AAPT_ISO_639_LOCALE.match('de')
assert _RE_AAPT_ISO_639_LOCALE.match('zh-rTW')
assert _RE_AAPT_ISO_639_LOCALE.match('fil')
assert not _RE_AAPT_ISO_639_LOCALE.match('land')

_RE_AAPT_BCP47_LOCALE = re.compile(r'^b\+[a-z][a-zA-Z0-9\+]+$')
assert _RE_AAPT_BCP47_LOCALE.match('b+sr')
assert _RE_AAPT_BCP47_LOCALE.match('b+sr+Latn')
assert _RE_AAPT_BCP47_LOCALE.match('b+en+US')
assert not _RE_AAPT_BCP47_LOCALE.match('b+')
assert not _RE_AAPT_BCP47_LOCALE.match('b+1234')

_RE_AAPT_STRING_RESOURCE_HEADER = re.compile(
    r'^\s+resource 0x([0-9a-f]+) [a-zA-Z][a-zA-Z0-9.]+:string/(\w+):.*$')
assert _RE_AAPT_STRING_RESOURCE_HEADER.match(
    r' resource 0x7f12009c org.chromium.chrome:string/0_resource_name_obfuscated: t=0x03 d=0x0000caa9 (s=0x0008 r=0x00)'
)

_RE_AAPT_STRING_RESOURCE_VALUE = re.compile(r'^\s+\(string8\) "(.*)"$')
assert _RE_AAPT_STRING_RESOURCE_VALUE.match(r' (string8) "瀏覽首頁"')

# pylint: enable=line-too-long


def _ConvertAaptLocaleToBcp47(locale):
  """Convert a locale name from 'aapt dump' to its BCP-47 form.

  Handles both the 'b+lang+Script' form and the 'lang-rREGION' form. Any
  other input (including '(default)') is returned unchanged.
  """
  if locale.startswith('b+'):
    return '-'.join(locale[2:].split('+'))
  lang, _, region = locale.partition('-r')
  if region:
    return '%s-%s' % (lang, region)
  return lang


assert _ConvertAaptLocaleToBcp47('(default)') == '(default)'
assert _ConvertAaptLocaleToBcp47('en') == 'en'
assert _ConvertAaptLocaleToBcp47('en-rUS') == 'en-US'
assert _ConvertAaptLocaleToBcp47('en-US') == 'en-US'
assert _ConvertAaptLocaleToBcp47('fil') == 'fil'
assert _ConvertAaptLocaleToBcp47('b+sr+Latn') == 'sr-Latn'


def ParseApkResources(aapt_path, apk_path):
  """Use aapt to extract the localized strings of a given APK.

  Runs '<aapt> dump --values resources <apk>' and parses its standard output
  line by line. Lines that are not valid UTF-8 and blank lines are skipped.
  Strings listed under a config that is not '(default)' or a recognized
  locale are ignored. The exit status of the command is not checked.

  Args:
    aapt_path: Path to the aapt executable.
    apk_path: Path to input APK.
  Returns:
    A new ResourceStringMap instance populated with the APK's content.
  """
  cmd_args = [aapt_path, 'dump', '--values', 'resources', apk_path]

  res_map = ResourceStringMap()
  current_locale = None
  current_resource_id = -1  # represents undefined.
  current_resource_name = None
  # True when the previous line was a string resource header, i.e. the
  # current line is expected to hold its value.
  need_value = False
  # The context manager closes the pipe and waits for the process on exit.
  with subprocess.Popen(cmd_args, stdout=subprocess.PIPE) as p:
    # Iterating the pipe stops at end-of-file only, so a blank line in the
    # dump cannot be mistaken for the end of the output.
    for raw_line in p.stdout:
      try:
        line = raw_line.rstrip().decode('utf8')
      except UnicodeDecodeError:
        continue

      if not line:
        continue

      m = _RE_AAPT_CONFIG.match(line)
      if m:
        aapt_locale = m.group(1)
        if (aapt_locale == '(default)'
            or _RE_AAPT_ISO_639_LOCALE.match(aapt_locale)
            or _RE_AAPT_BCP47_LOCALE.match(aapt_locale)):
          current_locale = _ConvertAaptLocaleToBcp47(aapt_locale)
        else:
          # Not a locale config (e.g. 'land'): ignore its strings instead of
          # attributing them to whichever locale came before.
          current_locale = None
        continue

      if current_locale is None:
        continue

      if need_value:
        m = _RE_AAPT_STRING_RESOURCE_VALUE.match(line)
        if not m:
          # Should not happen
          sys.stderr.write(
              'WARNING: Missing value for string ID 0x%08x "%s"\n' %
              (current_resource_id, current_resource_name))
          resource_value = '<MISSING_STRING_%08x>' % current_resource_id
        else:
          resource_value = UnquoteString(m.group(1))

        res_map.AddValue(current_resource_id, current_resource_name,
                         'config %s' % current_locale, resource_value)
        need_value = False
      else:
        m = _RE_AAPT_STRING_RESOURCE_HEADER.match(line)
        if m:
          current_resource_id = int(m.group(1), 16)
          current_resource_name = m.group(2)
          need_value = True

  return res_map


def main(args):
  """Parse |args|, dump the strings of the input APK or bundle.

  Args:
    args: Command-line arguments, without the program name.
  """
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
  parser.add_argument(
      'input_file',
      help='Input file path. This can be either an APK, or an app bundle.')
  parser.add_argument('--output', help='Optional output file path.')
  parser.add_argument(
      '--omit-ids',
      action='store_true',
      help='Omit resource IDs in the output. This is useful '
      'to compare the contents of two distinct builds of the '
      'same APK.')
  parser.add_argument(
      '--aapt-path',
      default=_AAPT_DEFAULT_PATH,
      help='Path to aapt executable. Optional for APKs.')
  parser.add_argument(
      '--r-txt-path',
      help='Path to an optional input R.txt file used to translate resource '
      'IDs to string names. Useful when resources names in the input files '
      'were obfuscated. NOTE: If ${INPUT_FILE}.R.txt exists, if will be used '
      'automatically by this script.')
  parser.add_argument(
      '--bundletool-path',
      default=_DEFAULT_BUNDLETOOL_PATH,
      help='Path to alternate bundletool .jar file. Only used for bundles.')
  parser.add_argument(
      '--profile', action='store_true', help='Enable Python profiling.')

  options = parser.parse_args(args)

  # Create a {res_id -> res_name} map for unobfuscation, if needed.
  res_id_name_map = {}
  r_txt_path = options.r_txt_path
  if not r_txt_path:
    # Fall back to an R.txt file sitting next to the input file, if any.
    candidate_r_txt_path = options.input_file + '.R.txt'
    if os.path.exists(candidate_r_txt_path):
      r_txt_path = candidate_r_txt_path

  if r_txt_path:
    res_id_name_map = ReadStringMapFromRTxt(r_txt_path)

  # Create a helper function that creates a new ResourceStringMap instance
  # based on the input file's type. parser.error() exits the program.
  if IsFilePathABundle(options.input_file):
    if not options.bundletool_path:
      parser.error(
          '--bundletool-path <BUNDLETOOL_JAR> is required to parse bundles.')

    # use bundletool to parse the bundle resources.
    def create_string_map():
      return ParseBundleResources(options.bundletool_path, options.input_file)

  elif IsFilePathAnApk(options.input_file):
    if not options.aapt_path:
      parser.error('--aapt-path <AAPT> is required to parse APKs.')

    # Use aapt dump to parse the APK resources.
    def create_string_map():
      return ParseApkResources(options.aapt_path, options.input_file)

  else:
    parser.error('Unknown file format: %s' % options.input_file)

  # Print everything now.
  with ManagedOutput(options.output) as output:
    with ManagedPythonProfiling(options.profile):
      res_map = create_string_map()
      res_map.RemapResourceNames(res_id_name_map)
      lines = AutoIndentStringList(res_map.ToStringList(options.omit_ids))
      for line in lines:
        output.write(line)
        output.write('\n')


if __name__ == "__main__":
  main(sys.argv[1:])