1 | # -*- coding: utf-8 -*- |
2 | # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
3 | # See https://llvm.org/LICENSE.txt for license information. |
4 | # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
5 | """ This module is responsible for capturing the compiler invocations of any |
6 | build process. The result of that should be a compilation database. |
7 | |
8 | This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES |
9 | mechanisms provided by the dynamic linker. The related library is implemented |
10 | in C language and can be found under 'libear' directory. |
11 | |
12 | The 'libear' library is capturing all child process creation and logging the |
13 | relevant information about it into separate files in a specified directory. |
14 | The parameter of this process is the output directory name, where the report |
15 | files shall be placed. This parameter is passed as an environment variable. |
16 | |
17 | The module also implements compiler wrappers to intercept the compiler calls. |
18 | |
19 | The module implements the build command execution and the post-processing of |
20 | the output files, which are condensed into a compilation database. """ |
21 | |
22 | import sys |
23 | import os |
24 | import os.path |
25 | import re |
26 | import itertools |
27 | import json |
28 | import glob |
29 | import logging |
30 | from libear import build_libear, TemporaryDirectory |
31 | from libscanbuild import command_entry_point, compiler_wrapper, \ |
32 | wrapper_environment, run_command, run_build |
33 | from libscanbuild import duplicate_check |
34 | from libscanbuild.compilation import split_command |
35 | from libscanbuild.arguments import parse_args_for_intercept_build |
36 | from libscanbuild.shell import encode, decode |
37 | |
38 | __all__ = ['capture', 'intercept_build', 'intercept_compiler_wrapper'] |
39 | |
# ASCII control characters that delimit the fields of an execution report.
GS = '\x1d'  # group separator: closes one execution entry
RS = '\x1e'  # record separator: splits the fields inside an entry
US = '\x1f'  # unit separator: follows every command line argument

# Executable names exported through CC/CXX when compiler wrappers are used.
COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'
# Suffix of the execution report files (same as in ear.c).
TRACE_FILE_EXTENSION = '.cmd'
# Values of sys.platform for which library preload is reported as disabled.
WRAPPER_ONLY_PLATFORMS = frozenset(('win32', 'cygwin'))
48 | |
49 | |
@command_entry_point
def intercept_build():
    """ Command line entry point of 'intercept-build'.

    Parses the command line arguments and hands them over to 'capture'.

    :return: whatever 'capture' returns for the parsed arguments. """

    return capture(parse_args_for_intercept_build())
56 | |
57 | |
def capture(args):
    """ The entry point of build command interception.

    Runs the build command in an intercepting environment, reads the
    execution reports written into a temporary directory and dumps the
    resulting compilation database into 'args.cdb'.

    :param args: the parsed command line arguments; 'build', 'cdb' and
                 (when present) 'append' are read here, the whole object
                 is also passed to 'setup_environment',
    :return: the value returned by 'run_build' for the build command. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements.

        :param commands: iterable of parsed execution reports,
        :return: lazy sequence of unique entries whose source file exists. """

        # create entries from the current run
        current = itertools.chain.from_iterable(
            # creates a sequence of entry generators from an exec,
            format_entry(command) for command in commands)
        # read entries from previous run
        if 'append' in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both
        duplicate = duplicate_check(entry_hash)
        return (entry
                for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-') as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir)
        exit_code = run_build(args.build, env=environment)
        # read the intercepted exec calls; the names produced by glob
        # already carry the 'tmp_dir' prefix, so they are used as they are
        pattern = os.path.join(tmp_dir, '*' + TRACE_FILE_EXTENSION)
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(filename)
            for filename in sorted(glob.iglob(pattern)))
        # do post processing; the lazy pipeline is evaluated here, before
        # the output is opened, because opening with 'w+' truncates the file
        # and a failure afterwards would lose an existing database
        entries = list(post_processing(exec_traces))
        # dump the compilation database
        with open(args.cdb, 'w+') as handle:
            json.dump(entries, handle, sort_keys=True, indent=4)
        return exit_code
100 | |
101 | |
def setup_environment(args, destination):
    """ Build the environment in which the build command shall run.

    Depending on the arguments and on the platform, the exec calls are
    reported either by the preloaded 'libear' library or by the compiler
    'wrapper' programs; the matching environment variables are set here.

    :param args: the parsed command line arguments,
    :param destination: directory where the execution reports are written,
    :return: a copy of the current process environment, extended with the
             variables required by the selected interception method. """

    c_compiler = 'cc' if 'cc' not in args else args.cc
    # NOTE: resolved like the C compiler, but not used further down here.
    cxx_compiler = 'c++' if 'cxx' not in args else args.cxx

    # the library is only built when preload is wanted and possible
    if args.override_compiler or is_preload_disabled(sys.platform):
        libear_path = None
    else:
        libear_path = build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment['INTERCEPT_BUILD_TARGET_DIR'] = destination

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        environment.update(wrapper_environment(args))
        # set after the update, so the wrapper names always win
        environment['CC'] = COMPILER_WRAPPER_CC
        environment['CXX'] = COMPILER_WRAPPER_CXX
        return environment

    if sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment['DYLD_INSERT_LIBRARIES'] = libear_path
        environment['DYLD_FORCE_FLAT_NAMESPACE'] = '1'
        return environment

    logging.debug('intercept gonna preload libear on UNIX')
    environment['LD_PRELOAD'] = libear_path
    return environment
136 | |
137 | |
@command_entry_point
def intercept_compiler_wrapper():
    """ Command line entry point of `intercept-cc` and `intercept-c++`.

    Delegates to 'compiler_wrapper', giving it the reporting callback
    'intercept_compiler_wrapper_impl'.

    :return: whatever 'compiler_wrapper' returns. """

    callback = intercept_compiler_wrapper_impl
    return compiler_wrapper(callback)
143 | |
144 | |
def intercept_compiler_wrapper_impl(_, execution):
    """ Record the current compiler execution in the target directory.

    The directory name is taken from the 'INTERCEPT_BUILD_TARGET_DIR'
    environment variable and the report file is named after the id of the
    current process. A missing directory name, or an IOError raised while
    the report is written, is only logged as a warning.

    :param _: ignored,
    :param execution: the execution object handed to 'write_exec_trace'. """

    warning_format = 'execution report might be incomplete: %s'

    report_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
    if report_dir:
        # write current execution info to the pid file
        try:
            report_name = str(os.getpid()) + TRACE_FILE_EXTENSION
            report_path = os.path.join(report_dir, report_name)
            logging.debug('writing execution report to: %s', report_path)
            write_exec_trace(report_path, execution)
        except IOError:
            logging.warning(warning_format, 'io problem')
    else:
        logging.warning(warning_format, 'missing target directory')
166 | |
def write_exec_trace(filename, entry):
    """ Append one execution report to the given file.

    The layout shall stay in sync with the report writer of the
    interception library: the fields are joined by RS, every command
    argument is followed by US and the whole entry is closed by GS. The
    process id is written into both leading fields and the third field is
    the literal 'wrapper'. The text is stored UTF-8 encoded.

    :param filename: path to the output execution trace file,
    :param entry: the execution object to report; its 'pid', 'cmd' and
                  'cwd' attributes are read. """

    with open(filename, 'ab') as handle:
        process_id = str(entry.pid)
        arguments = US.join(entry.cmd) + US
        fields = (process_id, process_id, 'wrapper', entry.cwd, arguments)
        report = RS.join(fields) + GS
        handle.write(report.encode('utf-8'))
181 | |
182 | |
def parse_exec_trace(filename):
    """ Read the execution reports stored in the given file.

    The file is expected to be written by the interception library or by
    the wrapper command, and it _might_ hold more than one report. Reports
    are closed by GS, their fields are separated by RS and every command
    argument is followed by US.

    :param filename: path to an execution report file,
    :return: generator of dictionaries with the 'pid', 'ppid', 'function',
             'directory' and 'command' keys, one per non-empty report. """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'r') as handle:
        for report in handle.read().split(GS):
            # the text after the last GS (and any empty group) is skipped
            if not report:
                continue
            fields = report.split(RS)
            yield dict(
                pid=fields[0],
                ppid=fields[1],
                function=fields[2],
                directory=fields[3],
                # drop the piece which follows the closing US
                command=fields[4].split(US)[:-1])
202 | |
203 | |
def format_entry(exec_trace):
    """ Turn one execution report into compilation database entries.

    Nothing is generated when 'split_command' does not give a result for
    the reported command. Otherwise one entry is generated for each source
    file of the compilation.

    :param exec_trace: dictionary with the 'command' and 'directory' keys,
    :return: generator of dictionaries with the 'directory', 'command' and
             'file' keys, where 'file' is a normalized absolute path. """

    logging.debug('format this command: %s', exec_trace['command'])
    compilation = split_command(exec_trace['command'])
    if not compilation:
        return

    for source in compilation.files:
        # the compiler name is reduced to either 'c++' or 'cc'
        compiler = 'cc' if compilation.compiler != 'c++' else 'c++'
        command = [compiler, '-c'] + compilation.flags + [source]
        logging.debug('formated as: %s', command)

        working_dir = exec_trace['directory']
        entry = {'directory': working_dir, 'command': encode(command)}
        # relative source names are resolved against the working directory
        if os.path.isabs(source):
            entry['file'] = os.path.normpath(source)
        else:
            entry['file'] = os.path.normpath(
                os.path.join(working_dir, source))
        yield entry
224 | |
225 | |
def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. On Darwin this is done by running
    'csrutil status' and checking whether a line of its output starts with
    'System Integrity Protection status: enabled'.

    Platforms listed in WRAPPER_ONLY_PLATFORMS are always reported as
    disabled, every other non-Darwin platform as enabled.

    :param platform: name of the platform (returned by sys.platform),
    :return: True if library preload will fail by the dynamic linker. """

    if platform in WRAPPER_ONLY_PLATFORMS:
        return True
    elif platform == 'darwin':
        command = ['csrutil', 'status']
        pattern = re.compile(r'System Integrity Protection status:\s+enabled')
        try:
            return any(pattern.match(line) for line in run_command(command))
        except Exception:
            # Best effort: when the status can not be queried, preload is
            # assumed to work. Only 'Exception' is caught, so that
            # KeyboardInterrupt and SystemExit are not swallowed here.
            return False
    else:
        return False
247 | |
248 | |
def entry_hash(entry):
    """ Compute the key which identifies a compilation database entry.

    :param entry: dictionary with the 'file', 'directory' and 'command'
                  keys,
    :return: string made of the reversed file name, the reversed directory
             and the decoded command without its first word, joined by
             '<>'. """

    parts = (
        # file name and directory are stored reversed; the original
        # author's note is that this gives faster lookup in a set
        entry['file'][::-1],
        entry['directory'][::-1],
        # On OS X the 'cc' and 'c++' compilers are wrappers for 'clang',
        # therefore both calls would be logged. To avoid this, the first
        # word of the command is left out of the key.
        ' '.join(decode(entry['command'])[1:]),
    )
    return '<>'.join(parts)
263 | |