# -*- coding: utf-8 -*-
#                     The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
""" This module is responsible to capture the compiler invocation of any
build process. The result of that should be a compilation database.

This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
mechanisms provided by the dynamic linker. The related library is implemented
in C language and can be found under 'libear' directory.

The 'libear' library is capturing all child process creation and logging the
relevant information about it into separate files in a specified directory.
The parameter of this process is the output directory name, where the report
files shall be placed. This parameter is passed as an environment variable.

The module also implements compiler wrappers to intercept the compiler calls.

The module implements the build command execution and the post-processing of
the output files, the result of which is condensed into a compilation
database. """

import sys
import os
import os.path
import re
import itertools
import json
import glob
import argparse
import logging
import subprocess
from libear import build_libear, TemporaryDirectory
from libscanbuild import command_entry_point
from libscanbuild import duplicate_check, tempdir, initialize_logging
from libscanbuild.compilation import split_command
from libscanbuild.shell import encode, decode

__all__ = ['capture', 'intercept_build_main', 'intercept_build_wrapper']

# Separators of the exec report format: GS terminates one process record,
# RS separates the fields of a record, US terminates each command argument.
GS = chr(0x1d)
RS = chr(0x1e)
US = chr(0x1f)

# File names of the compiler wrapper executables (looked up in 'bin_dir').
COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'


@command_entry_point
def intercept_build_main(bin_dir):
    """ Entry point for 'intercept-build' command.

    Parses the command line, initializes logging and runs the capture.

    :param bin_dir: directory where the compiler wrapper executables live.
    :return: 0 when no build command was given (help is printed instead),
             otherwise whatever 'capture' returns. """

    parser = create_parser()
    args = parser.parse_args()

    initialize_logging(args.verbose)
    logging.debug('Parsed arguments: %s', args)

    if not args.build:
        parser.print_help()
        return 0

    return capture(args, bin_dir)


def capture(args, bin_dir):
    """ The entry point of build command interception.

    Runs the build command in an instrumented environment, collects the
    exec reports from a temporary directory and writes the result as JSON
    into 'args.cdb'.

    :param args: parsed command line arguments (see 'create_parser').
    :param bin_dir: directory where the compiler wrapper executables live.
    :return: the exit code of the build command. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements. """

        # create entries from the current run
        current = itertools.chain.from_iterable(
            # creates a sequence of entry generators from an exec,
            format_entry(command) for command in commands)
        # read entries from previous run
        if 'append' in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both
        duplicate = duplicate_check(entry_hash)
        return (entry
                for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-', dir=tempdir()) as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir, bin_dir)
        logging.debug('run build in environment: %s', environment)
        exit_code = subprocess.call(args.build, env=environment)
        logging.info('build finished with exit code: %d', exit_code)
        # read the intercepted exec calls
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(os.path.join(tmp_dir, filename))
            for filename in sorted(glob.iglob(os.path.join(tmp_dir, '*.cmd'))))
        # do post processing only if that was requested
        if 'raw_entries' not in args or not args.raw_entries:
            entries = post_processing(exec_traces)
        else:
            entries = exec_traces
        # The generators above are lazy. Consume them *before* the output
        # file is opened for writing: opening truncates the file, so a
        # failure while reading the reports would otherwise destroy an
        # already existing compilation database.
        entries = list(entries)
        # dump the compilation database
        with open(args.cdb, 'w+') as handle:
            json.dump(entries, handle, sort_keys=True, indent=4)
        return exit_code


def setup_environment(args, destination, bin_dir):
    """ Sets up the environment for the build command.

    It sets the required environment variables for the build command.
    The exec calls will be logged by the 'libear' preloaded library or by the
    'wrapper' programs.

    :param args: parsed command line arguments.
    :param destination: directory where the exec reports shall be written.
    :param bin_dir: directory where the compiler wrapper executables live.
    :return: a copy of 'os.environ' (as dict) extended with the variables
             needed by the selected interception method. """

    c_compiler = args.cc if 'cc' in args else 'cc'
    cxx_compiler = args.cxx if 'cxx' in args else 'c++'

    # The library is only built when preloading is both wanted and possible.
    if args.override_compiler or is_preload_disabled(sys.platform):
        libear_path = None
    else:
        libear_path = build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment.update({'INTERCEPT_BUILD_TARGET_DIR': destination})

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        environment.update({
            'CC': os.path.join(bin_dir, COMPILER_WRAPPER_CC),
            'CXX': os.path.join(bin_dir, COMPILER_WRAPPER_CXX),
            'INTERCEPT_BUILD_CC': c_compiler,
            'INTERCEPT_BUILD_CXX': cxx_compiler,
            'INTERCEPT_BUILD_VERBOSE': 'DEBUG' if args.verbose > 2 else 'INFO'
        })
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment.update({
            'DYLD_INSERT_LIBRARIES': libear_path,
            'DYLD_FORCE_FLAT_NAMESPACE': '1'
        })
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment.update({'LD_PRELOAD': libear_path})

    return environment


def intercept_build_wrapper(cplusplus):
    """ Entry point for `intercept-cc` and `intercept-c++` compiler wrappers.

    It does generate execution report into target directory. And execute
    the wrapped compilation with the real compiler. The parameters for
    report and execution are from environment variables.

    Those parameters which for 'libear' library can't have meaningful
    values are faked.

    :param cplusplus: true when called as the C++ compiler wrapper.
    :return: the exit code of the real compiler. """

    # initialize wrapper logging
    logging.basicConfig(format='intercept: %(levelname)s: %(message)s',
                        level=os.getenv('INTERCEPT_BUILD_VERBOSE', 'INFO'))
    # write report
    try:
        target_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
        if not target_dir:
            raise UserWarning('exec report target directory not found')
        pid = str(os.getpid())
        target_file = os.path.join(target_dir, pid + '.cmd')
        logging.debug('writing exec report to: %s', target_file)
        with open(target_file, 'ab') as handler:
            working_dir = os.getcwd()
            command = US.join(sys.argv) + US
            # the parent pid is not known here, the own pid is used instead
            content = RS.join([pid, pid, 'wrapper', working_dir, command]) + GS
            handler.write(content.encode('utf-8'))
    except IOError:
        # a failed report must not prevent the compilation itself
        logging.exception('writing exec report failed')
    except UserWarning as warning:
        logging.warning(warning)
    # execute with real compiler
    compiler = os.getenv('INTERCEPT_BUILD_CXX', 'c++') if cplusplus \
        else os.getenv('INTERCEPT_BUILD_CC', 'cc')
    compilation = [compiler] + sys.argv[1:]
    logging.debug('execute compiler: %s', compilation)
    return subprocess.call(compilation)


def parse_exec_trace(filename):
    """ Parse the file generated by the 'libear' preloaded library.

    Given filename points to a file which contains the basic report
    generated by the interception library or wrapper command. A single
    report file _might_ contain multiple process creation info.

    Records which have fewer fields than expected (e.g. the writer was
    interrupted) are skipped with a warning instead of aborting the run.

    :param filename: path of the report file.
    :return: generator of dicts with the keys 'pid', 'ppid', 'function',
             'directory' and 'command' (the latter is a list of strings). """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'r') as handler:
        content = handler.read()
    # The whole content is in memory, the file does not need to stay open
    # while the generator is suspended.
    for group in filter(bool, content.split(GS)):
        records = group.split(RS)
        if len(records) < 5:
            logging.warning('skip malformed exec trace in: %s', filename)
            continue
        yield {
            'pid': records[0],
            'ppid': records[1],
            'function': records[2],
            'directory': records[3],
            # every argument is terminated by US, drop the empty tail
            'command': records[4].split(US)[:-1]
        }


def format_entry(exec_trace):
    """ Generate the desired fields for compilation database entries.

    :param exec_trace: a dict as produced by 'parse_exec_trace'.
    :return: generator of dicts with the keys 'directory', 'command' and
             'file'; one per source file of the command. Yields nothing
             when 'split_command' returns a false value for the command. """

    def abspath(cwd, name):
        """ Create normalized absolute path from input filename. """
        fullname = name if os.path.isabs(name) else os.path.join(cwd, name)
        return os.path.normpath(fullname)

    logging.debug('format this command: %s', exec_trace['command'])
    compilation = split_command(exec_trace['command'])
    if not compilation:
        return
    # the compiler name is the same for every source file of the command
    compiler = 'c++' if compilation.compiler == 'c++' else 'cc'
    for source in compilation.files:
        command = [compiler, '-c'] + compilation.flags + [source]
        logging.debug('formated as: %s', command)
        yield {
            'directory': exec_trace['directory'],
            'command': encode(command),
            'file': abspath(exec_trace['directory'], source)
        }


def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    Same problem on linux when SELinux is enabled. The status query program
    'sestatus' and the output when it's enabled 'SELinux status: enabled'.

    :param platform: platform name in the form of 'sys.platform'.
    :return: True when the status query reports the protection as enabled;
             False otherwise, including when the query can not be run. """

    if platform == 'darwin':
        pattern = re.compile(r'System Integrity Protection status:\s+enabled')
        command = ['csrutil', 'status']
    elif platform in {'linux', 'linux2'}:
        pattern = re.compile(r'SELinux status:\s+enabled')
        command = ['sestatus']
    else:
        return False

    try:
        lines = subprocess.check_output(command).decode('utf-8')
        return any(pattern.match(line) for line in lines.splitlines())
    except Exception:
        # Best effort: a missing or failing query program means the status
        # is unknown and preload is attempted. Unlike a bare 'except', this
        # lets KeyboardInterrupt and SystemExit propagate.
        logging.debug('status query failed: %s', command, exc_info=True)
        return False


def entry_hash(entry):
    """ Implement unique hash method for compilation database entries.

    :param entry: a compilation database entry; the keys 'file',
                  'directory' and 'command' are read.
    :return: a string which is used as the identity of the entry. """

    # For faster lookup in set filename is reverted
    filename = entry['file'][::-1]
    # For faster lookup in set directory is reverted
    directory = entry['directory'][::-1]
    # On OS X the 'cc' and 'c++' compilers are wrappers for
    # 'clang' therefore both call would be logged. To avoid
    # this the hash does not contain the first word of the
    # command.
    command = ' '.join(decode(entry['command'])[1:])

    return '<>'.join([filename, directory, command])


def create_parser():
    """ Command line argument parser factory method.

    :return: the 'argparse.ArgumentParser' of the 'intercept-build'
             command. """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--verbose', '-v',
        action='count',
        default=0,
        help="""Enable verbose output from '%(prog)s'. A second and third
                flag increases verbosity.""")
    parser.add_argument(
        '--cdb',
        metavar='<file>',
        default="compile_commands.json",
        help="""The JSON compilation database.""")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--append',
        action='store_true',
        help="""Append new entries to existing compilation database.""")
    group.add_argument(
        '--disable-filter', '-n',
        dest='raw_entries',
        action='store_true',
        help="""Intercepted child process creation calls (exec calls) are all
                logged to the output. The output is not a compilation database.
                This flag is for debug purposes.""")

    advanced = parser.add_argument_group('advanced options')
    advanced.add_argument(
        '--override-compiler',
        action='store_true',
        help="""Always resort to the compiler wrapper even when better
                intercept methods are available.""")
    advanced.add_argument(
        '--use-cc',
        metavar='<path>',
        dest='cc',
        default='cc',
        help="""When '%(prog)s' analyzes a project by interposing a compiler
                wrapper, which executes a real compiler for compilation and
                do other tasks (record the compiler invocation). Because of
                this interposing, '%(prog)s' does not know what compiler your
                project normally uses. Instead, it simply overrides the CC
                environment variable, and guesses your default compiler.

                If you need '%(prog)s' to use a specific compiler for
                *compilation* then you can use this option to specify a path
                to that compiler.""")
    advanced.add_argument(
        '--use-c++',
        metavar='<path>',
        dest='cxx',
        default='c++',
        help="""This is the same as "--use-cc" but for C++ code.""")

    parser.add_argument(
        dest='build',
        nargs=argparse.REMAINDER,
        help="""Command to run.""")

    return parser