1#!/usr/bin/env python3 2# Copyright 2018 The Chromium Authors 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6"""Extracts the unwind tables in from breakpad symbol files 7 8Runs dump_syms on the given binary file and extracts the CFI data into the 9given output file. 10The output file is a binary file containing CFI rows ordered based on function 11address. The output file only contains rows that match the most popular rule 12type in CFI table, to reduce the output size and specify data in compact format. 13See doc https://github.com/google/breakpad/blob/main/docs/symbol_files.md. 141. The CFA rules should be of postfix form "SP <val> +". 152. The RA rules should be of postfix form "CFA <val> + ^". 16Note: breakpad represents dereferencing address with '^' operator. 17 18The output file has 2 tables UNW_INDEX and UNW_DATA, inspired from ARM EHABI 19format. The first table contains function addresses and an index into the 20UNW_DATA table. The second table contains one or more rows for the function 21unwind information. 22 23The output file starts with 4 bytes counting the number of entries in UNW_INDEX. 24Then UNW_INDEX table and UNW_DATA table. 25 26UNW_INDEX contains two columns of N rows each, where N is the number of 27functions. 28 1. First column 4 byte rows of all the function start address as offset from 29 start of the binary, in sorted order. 30 2. For each function addr, the second column contains 2 byte indices in order. 31 The indices are offsets (in count of 2 bytes) of the CFI data from start of 32 UNW_DATA. 33The last entry in the table always contains CANT_UNWIND index to specify the 34end address of the last function. 35 36UNW_DATA contains data of all the functions. Each function data contains N rows. 37The data found at the address pointed from UNW_INDEX will be: 38 2 bytes: N - number of rows that belong to current function. 39 N * 4 bytes: N rows of data. 16 bits : Address offset from function start. 40 14 bits : CFA offset / 4. 41 2 bits : RA offset / 4. 42 43The function is not added to the unwind table in following conditions: 44C1. If length of the function code (number of instructions) is greater than 45 0xFFFF (2 byte address span). This is because we use 16 bits to refer to 46 offset of instruction from start of the address. 47C2. If the function moves the SP by more than 0xFFFF bytes. This is because we 48 use 14 bits to denote CFA offset (last 2 bits are 0). 49C3. If the Return Address is stored at an offset >= 16 from the CFA. Some 50 functions which have variable arguments can have offset upto 16. 51 TODO(ssid): We can actually store offset 16 by subtracting 1 from RA/4 since 52 we never have 0. 53C4: Some functions do not have unwind information defined in dwarf info. These 54 functions have index value CANT_UNWIND(0xFFFF) in UNW_INDEX table. 55 56 57Usage: 58 extract_unwind_tables.py --input_path [root path to unstripped chrome.so] 59 --output_path [output path] --dump_syms_path [path to dump_syms binary] 60""" 61 62import argparse 63import re 64import struct 65import subprocess 66import sys 67import tempfile 68 69 70_CFA_REG = '.cfa' 71_RA_REG = '.ra' 72 73_ADDR_ENTRY = 0 74_LENGTH_ENTRY = 1 75 76_CANT_UNWIND = 0xFFFF 77 78 79def _Write4Bytes(output_file, val): 80 """Writes a 32 bit unsigned integer to the given output file.""" 81 output_file.write(struct.pack('<L', val)); 82 83 84def _Write2Bytes(output_file, val): 85 """Writes a 16 bit unsigned integer to the given output file.""" 86 output_file.write(struct.pack('<H', val)); 87 88 89def _FindRuleForRegister(cfi_row, reg): 90 """Returns the postfix expression as string for a given register. 91 92 Breakpad CFI row format specifies rules for unwinding each register in postfix 93 expression form separated by space. Each rule starts with register name and a 94 colon. Eg: "CFI R1: <rule> R2: <rule>". 95 """ 96 out = [] 97 found_register = False 98 for part in cfi_row: 99 if found_register: 100 if part[-1] == ':': 101 break 102 out.append(part) 103 elif part == reg + ':': 104 found_register = True 105 return ' '.join(out) 106 107 108def _GetCfaAndRaOffset(cfi_row): 109 """Returns a tuple with 2 numbers (cfa_offset, ra_offset). 110 111 Returns right values if rule matches the predefined criteria. Returns (0, 0) 112 otherwise. The criteria for CFA rule is postfix form "SP <val> +" and RA rule 113 is postfix form "CFA -<val> + ^". 114 """ 115 cfa_offset = 0 116 ra_offset = 0 117 cfa_rule = _FindRuleForRegister(cfi_row, _CFA_REG) 118 ra_rule = _FindRuleForRegister(cfi_row, _RA_REG) 119 if cfa_rule and re.match(r'sp [0-9]+ \+', cfa_rule): 120 cfa_offset = int(cfa_rule.split()[1], 10) 121 if ra_rule: 122 if not re.match(r'.cfa -[0-9]+ \+ \^', ra_rule): 123 return (0, 0) 124 ra_offset = -1 * int(ra_rule.split()[1], 10) 125 return (cfa_offset, ra_offset) 126 127 128def _GetAllCfiRows(symbol_file): 129 """Returns parsed CFI data from given symbol_file. 130 131 Each entry in the cfi data dictionary returned is a map from function start 132 address to array of function rows, starting with FUNCTION type, followed by 133 one or more CFI rows. 134 """ 135 cfi_data = {} 136 current_func = [] 137 for line in symbol_file: 138 line = line.decode('utf8') 139 if 'STACK CFI' not in line: 140 continue 141 142 parts = line.split() 143 data = {} 144 if parts[2] == 'INIT': 145 # Add the previous function to the output 146 if len(current_func) > 1: 147 cfi_data[current_func[0][_ADDR_ENTRY]] = current_func 148 current_func = [] 149 150 # The function line is of format "STACK CFI INIT <addr> <length> ..." 151 data[_ADDR_ENTRY] = int(parts[3], 16) 152 data[_LENGTH_ENTRY] = int(parts[4], 16) 153 154 # Condition C1: Skip if length is large. 155 if data[_LENGTH_ENTRY] == 0 or data[_LENGTH_ENTRY] > 0xffff: 156 continue # Skip the current function. 157 else: 158 # The current function is skipped. 159 if len(current_func) == 0: 160 continue 161 162 # The CFI row is of format "STACK CFI <addr> .cfa: <expr> .ra: <expr> ..." 163 data[_ADDR_ENTRY] = int(parts[2], 16) 164 (data[_CFA_REG], data[_RA_REG]) = _GetCfaAndRaOffset(parts) 165 166 # Condition C2 and C3: Skip based on limits on offsets. 167 if data[_CFA_REG] == 0 or data[_RA_REG] >= 16 or data[_CFA_REG] > 0xffff: 168 current_func = [] 169 continue 170 assert data[_CFA_REG] % 4 == 0 171 # Since we skipped functions with code size larger than 0xffff, we should 172 # have no function offset larger than the same value. 173 assert data[_ADDR_ENTRY] - current_func[0][_ADDR_ENTRY] < 0xffff 174 175 if data[_ADDR_ENTRY] == 0: 176 # Skip current function, delete all previous entries. 177 current_func = [] 178 continue 179 assert data[_ADDR_ENTRY] % 2 == 0 180 current_func.append(data) 181 182 # Condition C4: Skip function without CFI rows. 183 if len(current_func) > 1: 184 cfi_data[current_func[0][_ADDR_ENTRY]] = current_func 185 return cfi_data 186 187 188def _WriteCfiData(cfi_data, out_file): 189 """Writes the CFI data in defined format to out_file.""" 190 # Stores the final data that will be written to UNW_DATA table, in order 191 # with 2 byte items. 192 unw_data = [] 193 194 # Represent all the CFI data of functions as set of numbers and map them to an 195 # index in the |unw_data|. This index is later written to the UNW_INDEX table 196 # for each function. This map is used to find index of the data for functions. 197 data_to_index = {} 198 # Store mapping between the functions to the index. 199 func_addr_to_index = {} 200 previous_func_end = 0 201 for addr, function in sorted(cfi_data.items()): 202 # Add an empty function entry when functions CFIs are missing between 2 203 # functions. 204 if previous_func_end != 0 and addr - previous_func_end > 4: 205 func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND 206 previous_func_end = addr + cfi_data[addr][0][_LENGTH_ENTRY] 207 208 assert len(function) > 1 209 func_data_arr = [] 210 func_data = 0 211 # The first row contains the function address and length. The rest of the 212 # rows have CFI data. Create function data array as given in the format. 213 for row in function[1:]: 214 addr_offset = row[_ADDR_ENTRY] - addr 215 cfa_offset = (row[_CFA_REG]) | (row[_RA_REG] // 4) 216 217 func_data_arr.append(addr_offset) 218 func_data_arr.append(cfa_offset) 219 220 # Consider all the rows in the data as one large integer and add it as a key 221 # to the |data_to_index|. 222 for data in func_data_arr: 223 func_data = (func_data << 16) | data 224 225 row_count = len(func_data_arr) // 2 226 if func_data not in data_to_index: 227 # When data is not found, create a new index = len(unw_data), and write 228 # the data to |unw_data|. 229 index = len(unw_data) 230 data_to_index[func_data] = index 231 unw_data.append(row_count) 232 for row in func_data_arr: 233 unw_data.append(row) 234 else: 235 # If the data was found, then use the same index for the function. 236 index = data_to_index[func_data] 237 assert row_count == unw_data[index] 238 func_addr_to_index[addr] = data_to_index[func_data] 239 240 # Mark the end end of last function entry. 241 func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND 242 243 # Write the size of UNW_INDEX file in bytes. 244 _Write4Bytes(out_file, len(func_addr_to_index)) 245 246 # Write the UNW_INDEX table. First list of addresses and then indices. 247 sorted_unw_index = sorted(func_addr_to_index.items()) 248 for addr, index in sorted_unw_index: 249 _Write4Bytes(out_file, addr) 250 for addr, index in sorted_unw_index: 251 _Write2Bytes(out_file, index) 252 253 # Write the UNW_DATA table. 254 for data in unw_data: 255 _Write2Bytes(out_file, data) 256 257 258def main(): 259 parser = argparse.ArgumentParser() 260 parser.add_argument( 261 '--input_path', required=True, 262 help='The input path of the unstripped binary') 263 parser.add_argument( 264 '--output_path', required=True, 265 help='The path of the output file') 266 parser.add_argument( 267 '--dump_syms_path', required=True, 268 help='The path of the dump_syms binary') 269 270 args = parser.parse_args() 271 cmd = ['./' + args.dump_syms_path, args.input_path, '-v'] 272 proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) 273 cfi_data = _GetAllCfiRows(proc.stdout) 274 if proc.wait(): 275 sys.stderr.write('dump_syms exited with code {} after {} symbols\n'.format( 276 proc.returncode, len(cfi_data))) 277 sys.exit(proc.returncode) 278 with open(args.output_path, 'wb') as out_file: 279 _WriteCfiData(cfi_data, out_file) 280 281 282if __name__ == '__main__': 283 main() 284