xref: /aosp_15_r20/external/clang/tools/scan-build-py/libscanbuild/intercept.py (revision 67e74705e28f6214e480b399dd47ea732279e315)
1*67e74705SXin Li# -*- coding: utf-8 -*-
2*67e74705SXin Li#                     The LLVM Compiler Infrastructure
3*67e74705SXin Li#
4*67e74705SXin Li# This file is distributed under the University of Illinois Open Source
5*67e74705SXin Li# License. See LICENSE.TXT for details.
""" This module is responsible for capturing the compiler invocations of any
build process. The result of that should be a compilation database.

This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
mechanisms provided by the dynamic linker. The related library is implemented
in C language and can be found under 'libear' directory.

The 'libear' library is capturing all child process creation and logging the
relevant information about it into separate files in a specified directory.
The parameter of this process is the output directory name, where the report
files shall be placed. This parameter is passed as an environment variable.

The module also implements compiler wrappers to intercept the compiler calls.

The module implements the build command execution and the post-processing of
the output files, which are condensed into a compilation database. """
22*67e74705SXin Li
23*67e74705SXin Liimport sys
24*67e74705SXin Liimport os
25*67e74705SXin Liimport os.path
26*67e74705SXin Liimport re
27*67e74705SXin Liimport itertools
28*67e74705SXin Liimport json
29*67e74705SXin Liimport glob
30*67e74705SXin Liimport argparse
31*67e74705SXin Liimport logging
32*67e74705SXin Liimport subprocess
33*67e74705SXin Lifrom libear import build_libear, TemporaryDirectory
34*67e74705SXin Lifrom libscanbuild import command_entry_point
35*67e74705SXin Lifrom libscanbuild import duplicate_check, tempdir, initialize_logging
36*67e74705SXin Lifrom libscanbuild.compilation import split_command
37*67e74705SXin Lifrom libscanbuild.shell import encode, decode
38*67e74705SXin Li
# Names exported by 'from libscanbuild.intercept import *'.
__all__ = [
    'capture',
    'intercept_build_main',
    'intercept_build_wrapper',
]
40*67e74705SXin Li
# ASCII separator characters used by the exec report file format.
GS = '\x1d'  # group separator: closes one exec report
RS = '\x1e'  # record separator: stands between the fields of a report
US = '\x1f'  # unit separator: closes every argument of the command field

# File names of the compiler wrappers; they are looked up in the 'bin_dir'.
COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'
47*67e74705SXin Li
48*67e74705SXin Li
@command_entry_point
def intercept_build_main(bin_dir):
    """ Entry point for 'intercept-build' command.

    Parses the command line and initializes the logging. When a build
    command was given it is executed via 'capture' and the result of that
    is returned; otherwise the help message is printed and 0 is returned. """

    arg_parser = create_parser()
    options = arg_parser.parse_args()

    initialize_logging(options.verbose)
    logging.debug('Parsed arguments: %s', options)

    if options.build:
        return capture(options, bin_dir)

    # there is no build command to run, show the usage instead
    arg_parser.print_help()
    return 0
64*67e74705SXin Li
65*67e74705SXin Li
def capture(args, bin_dir):
    """ The entry point of build command interception.

    Runs the 'args.build' command in the environment prepared by
    'setup_environment', reads the '*.cmd' exec reports which were written
    into a temporary directory and dumps the result as JSON into 'args.cdb'.

    When 'args.raw_entries' is set the exec reports are written as they
    are, otherwise they are converted into compilation database entries.

    Returns the exit code of the build command. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements. """

        # create entries from the current run (evaluated lazily)
        current = itertools.chain.from_iterable(
            # creates a sequence of entry generators from an exec,
            format_entry(command) for command in commands)
        # read entries from previous run; this has to be done right here,
        # because the caller truncates the very same file afterwards
        if 'append' in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both
        duplicate = duplicate_check(entry_hash)
        return (entry
                for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-', dir=tempdir()) as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir, bin_dir)
        logging.debug('run build in environment: %s', environment)
        exit_code = subprocess.call(args.build, env=environment)
        logging.info('build finished with exit code: %d', exit_code)
        # read the intercepted exec calls. The glob result already contains
        # 'tmp_dir', so the names are used as they are: joining them with
        # 'tmp_dir' once more would duplicate the prefix whenever the
        # temporary directory is a relative path.
        report_files = sorted(glob.iglob(os.path.join(tmp_dir, '*.cmd')))
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(report_file) for report_file in report_files)
        # do post processing only if that was requested
        if 'raw_entries' not in args or not args.raw_entries:
            entries = post_processing(exec_traces)
        else:
            entries = exec_traces
        # dump the compilation database. The entries are generated lazily,
        # so the list has to be built inside this 'with' block, while the
        # report files are still expected to be in place.
        with open(args.cdb, 'w+') as handle:
            json.dump(list(entries), handle, sort_keys=True, indent=4)
        return exit_code
113*67e74705SXin Li
114*67e74705SXin Li
def setup_environment(args, destination, bin_dir):
    """ Creates the environment for the build command.

    The result is a copy of the current process environment extended with
    the variables which are needed by the selected interception method:
    either the 'libear' library is preloaded, or the compiler wrappers
    from 'bin_dir' are set as 'CC' and 'CXX'. In both cases 'destination'
    is passed on as the directory of the exec reports. """

    c_compiler = args.cc if 'cc' in args else 'cc'
    cxx_compiler = args.cxx if 'cxx' in args else 'c++'

    # the library is built only when the preload method is usable
    if args.override_compiler or is_preload_disabled(sys.platform):
        libear_path = None
    else:
        libear_path = build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment['INTERCEPT_BUILD_TARGET_DIR'] = destination

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        verbosity = 'DEBUG' if args.verbose > 2 else 'INFO'
        environment['CC'] = os.path.join(bin_dir, COMPILER_WRAPPER_CC)
        environment['CXX'] = os.path.join(bin_dir, COMPILER_WRAPPER_CXX)
        environment['INTERCEPT_BUILD_CC'] = c_compiler
        environment['INTERCEPT_BUILD_CXX'] = cxx_compiler
        environment['INTERCEPT_BUILD_VERBOSE'] = verbosity
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment['DYLD_INSERT_LIBRARIES'] = libear_path
        environment['DYLD_FORCE_FLAT_NAMESPACE'] = '1'
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment['LD_PRELOAD'] = libear_path

    return environment
151*67e74705SXin Li
152*67e74705SXin Li
def intercept_build_wrapper(cplusplus):
    """ Entry point for `intercept-cc` and `intercept-c++` compiler wrappers.

    First it appends an execution report about this very process to a file
    in the target directory, then runs the real compiler with the received
    arguments and returns its exit code. The target directory, the real
    compilers and the log level are all taken from environment variables.

    The report fields which are meaningful only for the 'libear' library
    are filled with fake values. A failure of the report writing is only
    logged, the compilation is executed anyway. """

    # initialize wrapper logging
    log_level = os.getenv('INTERCEPT_BUILD_VERBOSE', 'INFO')
    logging.basicConfig(format='intercept: %(levelname)s: %(message)s',
                        level=log_level)
    # write report
    try:
        target_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
        if not target_dir:
            raise UserWarning('exec report target directory not found')
        pid = str(os.getpid())
        target_file = os.path.join(target_dir, pid + '.cmd')
        logging.debug('writing exec report to: %s', target_file)
        with open(target_file, 'ab') as handler:
            # the pid stands for the parent pid too, and 'wrapper' is
            # written in place of the name of the intercepted function
            fields = [pid, pid, 'wrapper', os.getcwd(),
                      US.join(sys.argv) + US]
            report = RS.join(fields) + GS
            handler.write(report.encode('utf-8'))
    except IOError:
        logging.exception('writing exec report failed')
    except UserWarning as warning:
        logging.warning(warning)
    # execute with real compiler
    if cplusplus:
        compiler = os.getenv('INTERCEPT_BUILD_CXX', 'c++')
    else:
        compiler = os.getenv('INTERCEPT_BUILD_CC', 'cc')
    compilation = [compiler] + sys.argv[1:]
    logging.debug('execute compiler: %s', compilation)
    return subprocess.call(compilation)
189*67e74705SXin Li
190*67e74705SXin Li
def parse_exec_trace(filename):
    """ Parse an exec report file and generate a dictionary for each report.

    The file is written either by the 'libear' preloaded library or by the
    compiler wrapper, and it _might_ contain more than one report. The
    reports are closed by GS, the fields of a report are separated by RS
    and every argument of the command field is closed by US. """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'r') as handler:
        content = handler.read()
        for report in content.split(GS):
            if not report:
                # nothing stands between (or after) the group separators
                continue
            fields = report.split(RS)
            # the last piece after the closing US is dropped
            arguments = fields[4].split(US)[:-1]
            yield dict(pid=fields[0],
                       ppid=fields[1],
                       function=fields[2],
                       directory=fields[3],
                       command=arguments)
210*67e74705SXin Li
211*67e74705SXin Li
def format_entry(exec_trace):
    """ Generate compilation database entries from a single exec report.

    Nothing is generated when 'split_command' gives a false value for the
    command. Otherwise an entry is generated for each of the source files,
    where the 'file' field is a normalized absolute path (relative names
    are resolved against the directory of the report). """

    logging.debug('format this command: %s', exec_trace['command'])
    compilation = split_command(exec_trace['command'])
    if not compilation:
        return

    for source in compilation.files:
        # only the language of the compiler is kept, not its real name
        driver = 'c++' if compilation.compiler == 'c++' else 'cc'
        command = [driver, '-c'] + compilation.flags + [source]
        logging.debug('formated as: %s', command)
        directory = exec_trace['directory']
        yield {
            'directory': directory,
            'command': encode(command),
            'file': os.path.normpath(
                source if os.path.isabs(source)
                else os.path.join(directory, source))
        }
232*67e74705SXin Li
233*67e74705SXin Li
def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    Same problem on linux when SELinux is enabled. The status query program
    'sestatus' and the output when it's enabled 'SELinux status: enabled'.

    The 'platform' parameter is a platform name as 'sys.platform' has it.
    Returns True only if the status query program was executed and reported
    the enabled state. Returns False on any other platform, and also when
    the query itself failed (this check is a best effort). """

    if platform == 'darwin':
        pattern = re.compile(r'System Integrity Protection status:\s+enabled')
        command = ['csrutil', 'status']
    elif platform in {'linux', 'linux2'}:
        pattern = re.compile(r'SELinux status:\s+enabled')
        command = ['sestatus']
    else:
        return False

    try:
        lines = subprocess.check_output(command).decode('utf-8')
    except Exception:
        # The query program might be missing, might fail, or might print
        # something which can't be decoded: none of these shall stop the
        # interception. A bare 'except' would swallow KeyboardInterrupt
        # and SystemExit too, which are not failures of the query.
        logging.debug('status query failed: %s', command, exc_info=True)
        return False
    return any(pattern.match(line) for line in lines.splitlines())
258*67e74705SXin Li
259*67e74705SXin Li
def entry_hash(entry):
    """ Create the string which identifies a compilation database entry.

    Two entries are taken as duplicates when this string is the same for
    both of them. """

    # On OS X the 'cc' and 'c++' compilers are wrappers for 'clang',
    # therefore both calls would be logged. To avoid this, the first
    # word of the command is left out from the hash.
    arguments = decode(entry['command'])[1:]

    # The file and directory names are reverted for faster lookup in set.
    return '<>'.join((entry['file'][::-1],
                      entry['directory'][::-1],
                      ' '.join(arguments)))
274*67e74705SXin Li
275*67e74705SXin Li
def create_parser():
    """ Build the command line argument parser of 'intercept-build'.

    The parser has the generic flags (verbosity and the output file), the
    '--append' and '--disable-filter' flags which exclude each other, the
    'advanced options' group about the compiler wrappers, and it collects
    everything else into 'build' as the command to run. """

    result = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # generic flags
    result.add_argument(
        '--verbose', '-v',
        default=0,
        action='count',
        help="""Enable verbose output from '%(prog)s'. A second and third
                flag increases verbosity.""")
    result.add_argument(
        '--cdb',
        default="compile_commands.json",
        metavar='<file>',
        help="""The JSON compilation database.""")

    # the raw output is not a compilation database, it can't be appended
    output_mode = result.add_mutually_exclusive_group()
    output_mode.add_argument(
        '--append',
        action='store_true',
        help="""Append new entries to existing compilation database.""")
    output_mode.add_argument(
        '--disable-filter', '-n',
        action='store_true',
        dest='raw_entries',
        help="""Intercepted child process creation calls (exec calls) are all
                logged to the output. The output is not a compilation database.
                This flag is for debug purposes.""")

    # flags about the compiler wrappers
    wrapper_flags = result.add_argument_group('advanced options')
    wrapper_flags.add_argument(
        '--override-compiler',
        action='store_true',
        help="""Always resort to the compiler wrapper even when better
                intercept methods are available.""")
    wrapper_flags.add_argument(
        '--use-cc',
        dest='cc',
        default='cc',
        metavar='<path>',
        help="""When '%(prog)s' analyzes a project by interposing a compiler
                wrapper, which executes a real compiler for compilation and
                do other tasks (record the compiler invocation). Because of
                this interposing, '%(prog)s' does not know what compiler your
                project normally uses. Instead, it simply overrides the CC
                environment variable, and guesses your default compiler.

                If you need '%(prog)s' to use a specific compiler for
                *compilation* then you can use this option to specify a path
                to that compiler.""")
    wrapper_flags.add_argument(
        '--use-c++',
        dest='cxx',
        default='c++',
        metavar='<path>',
        help="""This is the same as "--use-cc" but for C++ code.""")

    # all the remaining arguments are the build command itself
    result.add_argument(
        dest='build',
        nargs=argparse.REMAINDER,
        help="""Command to run.""")

    return result
340