CmpRuns.py source code [clang_source_code/utils/analyzer/CmpRuns.py]

1	#!/usr/bin/env python
2
3	"""
4	CmpRuns - A simple tool for comparing two static analyzer runs to determine
5	which reports have been added, removed, or changed.
6
7	This is designed to support automated testing using the static analyzer, from
8	two perspectives:
9	1. To monitor changes in the static analyzer's reports on real code bases,
10	for regression testing.
11
12	2. For use by end users who want to integrate regular static analyzer testing
13	into a buildbot like environment.
14
15	Usage:
16
17	# Load the results of both runs, to obtain lists of the corresponding
18	# AnalysisDiagnostic objects.
19	#
20	resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
21	resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)
22
23	# Generate a relation from diagnostics in run A to diagnostics in run B
24	# to obtain a list of pairs (a, b), where either element may be None.
25	diff = compareResults(resultsA, resultsB, opts)
26
27	"""
28	from __future__ import division, print_function
29
30	from collections import defaultdict
31
32	from math import log
33	from optparse import OptionParser
34	import json
35	import os
36	import plistlib
37	import re
38	import sys
39
# Captures the brace-delimited payload that follows a "Statistics: " prefix.
# DOTALL lets the payload span several lines.
STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)
41
class Colors(object):
    """
    Terminal escape sequences used for highlighting output.
    """
    # Resets any highlighting previously switched on.
    CLEAR = '\x1b[0m'
    GREEN = '\x1b[6;30;42m'
    RED = '\x1b[2;30;41m'
49
class SingleRunInfo(object):
    """
    Information about an analysis run.

    path - the analysis output directory
    root - the name of the root directory, which will be disregarded when
           determining the source file name
    verboseLog - stored unchanged for later use by callers
    """
    def __init__(self, path, root="", verboseLog=None):
        # Trailing '/' and '\' characters are dropped from the root.
        trimmedRoot = root.rstrip("/\\")
        self.verboseLog = verboseLog
        self.root = trimmedRoot
        self.path = path
59
60
class AnalysisDiagnostic(object):
    """
    One diagnostic entry together with the report it belongs to.

    data - the diagnostic dictionary; 'location' and 'path' are required
    report - the owning report; its `files` list and `run` are consulted
    htmlReport - name of the matching HTML report file, or None
    """
    def __init__(self, data, report, htmlReport):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._htmlReport = htmlReport
        self._reportSize = len(self._data['path'])

    def _stripRoot(self, fileName):
        """
        Return fileName without the run's root prefix and the separator
        that follows it; names outside the root are returned unchanged.
        """
        root = self._report.run.root
        if fileName.startswith(root) and len(root) > 0:
            return fileName[len(root) + 1:]
        return fileName

    def getFileName(self):
        """
        Return the root-relative name of the file the diagnostic points at.
        """
        return self._stripRoot(self._report.files[self._loc['file']])

    def getRootFileName(self):
        """
        Return the root-relative name of the file in which the diagnostic
        path starts, or getFileName() when the path is empty.
        """
        path = self._data['path']
        if not path:
            return self.getFileName()
        first = path[0]
        if 'location' in first:
            fIdx = first['location']['file']
        else:  # control edge
            fIdx = first['edges'][0]['start'][0]['file']
        # Strip the root exactly as getFileName() does, so that the two
        # names compare equal when they refer to the same file.
        return self._stripRoot(self._report.files[fIdx])

    def getLine(self):
        return self._loc['line']

    def getColumn(self):
        return self._loc['col']

    def getPathLength(self):
        return self._reportSize

    def getCategory(self):
        return self._data['category']

    def getDescription(self):
        return self._data['description']

    def getIssueIdentifier(self):
        """
        Return the key used to match diagnostics between runs: the file
        name, then the issue context and the issue hash when present.
        """
        identifier = self.getFileName() + "+"
        if 'issue_context' in self._data:
            identifier += self._data['issue_context'] + "+"
        if 'issue_hash_content_of_line_in_context' in self._data:
            identifier += str(
                self._data['issue_hash_content_of_line_in_context'])
        return identifier

    def getReport(self):
        """
        Return the path of the HTML report, or " " when there is none.
        """
        if self._htmlReport is None:
            return " "
        return os.path.join(self._report.run.path, self._htmlReport)

    def getReadableName(self):
        """
        Return a one-line description of the diagnostic. The root file is
        shown in brackets only when it differs from the diagnostic's file.
        """
        if 'issue_context' in self._data:
            funcnamePostfix = "#" + self._data['issue_context']
        else:
            funcnamePostfix = ""
        rootFilename = self.getRootFileName()
        fileName = self.getFileName()
        if rootFilename != fileName:
            filePrefix = "[%s] %s" % (rootFilename, fileName)
        else:
            filePrefix = rootFilename
        return '%s%s:%d:%d, %s: %s' % (filePrefix,
                                       funcnamePostfix,
                                       self.getLine(),
                                       self.getColumn(), self.getCategory(),
                                       self.getDescription())

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def getRawData(self):
        return self._data
140
141
class AnalysisReport(object):
    """
    The contents of one report: the run it belongs to, its list of file
    names, and the diagnostics attached to it (initially none).
    """
    def __init__(self, run, files):
        self.diagnostics = []
        self.files = files
        self.run = run
147
148
class AnalysisRun(object):
    """
    The accumulated results of one analysis run.

    info - object providing `path` and `root`; kept as `self.info`
    """
    def __init__(self, info):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics = []
        self.clang_version = None
        self.stats = []

    def getClangVersion(self):
        return self.clang_version

    def readSingleFile(self, p, deleteEmpty):
        """
        Read the plist file `p` and merge its contents into this run.

        p - path of the plist file
        deleteEmpty - when true, a file whose 'files' list is empty is
                      removed from disk
        """
        # plistlib.readPlist() no longer exists in Python 3.9+; prefer
        # plistlib.load() and fall back only where it is unavailable.
        if hasattr(plistlib, 'load'):
            with open(p, 'rb') as plistFile:
                data = plistlib.load(plistFile)
        else:
            data = plistlib.readPlist(p)

        if 'statistics' in data:
            self.stats.append(json.loads(data.pop('statistics')))

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version, so only the first one seen is kept.
        if 'clang_version' in data:
            version = data.pop('clang_version')
            if self.clang_version is None:
                self.clang_version = version

        # Ignore/delete empty reports.
        if not data['files']:
            if deleteEmpty:
                os.remove(p)
            return

        rawDiagnostics = data.pop('diagnostics')

        # Extract the HTML reports, if they exist. The first diagnostic
        # decides for the whole file; an empty list has no HTML reports.
        if rawDiagnostics and 'HTMLDiagnostics_files' in rawDiagnostics[0]:
            htmlFiles = []
            for d in rawDiagnostics:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
        else:
            htmlFiles = [None] * len(rawDiagnostics)

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(rawDiagnostics, htmlFiles)]

        # Every top-level key must have been consumed by now.
        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)
204
205
def loadResults(path, opts, root="", deleteEmpty=True):
    """
    Backwards compatibility API.

    Packs the arguments into a SingleRunInfo, taking the verbose log from
    opts.verboseLog, and hands it to loadResultsFromSingleRun.
    """
    runInfo = SingleRunInfo(path, root, opts.verboseLog)
    return loadResultsFromSingleRun(runInfo, deleteEmpty)
212
213
def loadResultsFromSingleRun(info, deleteEmpty=True):
    """
    Load the results of the analysis from a given output location.

    info - the SingleRunInfo object; info.path may name a single file or
           a directory that is searched recursively
    deleteEmpty - specifies if the empty plist files should be deleted

    Returns the populated AnalysisRun.
    """
    run = AnalysisRun(info)
    location = info.path

    # A single file is read directly, whatever its name.
    if os.path.isfile(location):
        run.readSingleFile(location, deleteEmpty)
        return run

    # Otherwise read every file whose name ends in 'plist' below location.
    for dirpath, _, filenames in os.walk(location):
        for name in filenames:
            if name.endswith('plist'):
                run.readSingleFile(os.path.join(dirpath, name), deleteEmpty)

    return run
235
236
def cmpAnalysisDiagnostic(d):
    """
    Sort key for diagnostics: the diagnostic's issue identifier.
    """
    return d.getIssueIdentifier()


def _pathLengthDifference(a, b, opts):
    """
    Return the histogram data point for a matched pair of diagnostics, or
    None when the lengths are equal, no histogram option is set, or the
    selected measure is undefined for this pair.
    """
    lenA = a.getPathLength()
    lenB = b.getPathLength()
    if lenA == lenB:
        return None
    if opts.relative_path_histogram:
        # The ratio is undefined for an empty path in B.
        if lenB == 0:
            return None
        return float(lenA) / lenB
    if opts.relative_log_path_histogram:
        # Neither the ratio nor its logarithm exists if a length is zero.
        if lenA == 0 or lenB == 0:
            return None
        return log(float(lenA) / lenB)
    if opts.absolute_path_histogram:
        return lenA - lenB
    return None


def compareResults(A, B, opts):
    """
    compareResults - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    A, B - runs exposing a `diagnostics` list
    opts - options providing the boolean attributes
           relative_path_histogram, relative_log_path_histogram and
           absolute_path_histogram

    The result is the relation as a list of pairs (a, b) where
    each element {a,b} is None or a matching element from the respective run.
    Matched pairs come first, then diagnostics only in A, then those only
    in B.

    When any histogram option is set, matplotlib is imported and a
    histogram of the collected path length differences is shown.
    """

    res = []

    # Data points for the optional histogram, one per matched pair whose
    # path lengths differ.
    path_difference_data = []

    # Quickly eliminate equal elements.
    neqA = []
    neqB = []
    eltsA = sorted(A.diagnostics, key=cmpAnalysisDiagnostic)
    eltsB = sorted(B.diagnostics, key=cmpAnalysisDiagnostic)
    while eltsA and eltsB:
        # Both lists are sorted ascending, so pop() yields the largest
        # remaining identifier of each run.
        a = eltsA.pop()
        b = eltsB.pop()
        idA = a.getIssueIdentifier()
        idB = b.getIssueIdentifier()
        if idA == idB:
            point = _pathLengthDifference(a, b, opts)
            if point is not None:
                path_difference_data.append(point)
            res.append((a, b))
        elif idA > idB:
            # Nothing left in B can match a; keep b for the next round.
            eltsB.append(b)
            neqA.append(a)
        else:
            eltsA.append(a)
            neqB.append(b)
    neqA.extend(eltsA)
    neqB.extend(eltsB)

    # FIXME: Add fuzzy matching. One simple and possible effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.

    res.extend((a, None) for a in neqA)
    res.extend((None, b) for b in neqB)

    if opts.relative_log_path_histogram or opts.relative_path_histogram or \
            opts.absolute_path_histogram:
        # Imported lazily so matplotlib is needed only for histograms.
        from matplotlib import pyplot
        pyplot.hist(path_difference_data, bins=100)
        pyplot.show()

    return res
305
def computePercentile(l, percentile):
    """
    Return computed percentile.

    l - the values; an empty sequence raises IndexError
    percentile - the requested fraction, e.g. 0.9 for the 90th percentile

    The rank is round(percentile * len(l) + 0.5) and is clamped to the
    valid range, so percentile 0 yields the minimum and percentile 1 the
    maximum instead of wrapping around or raising IndexError.
    """
    ordered = sorted(l)
    index = int(round(percentile * len(ordered) + 0.5)) - 1
    # Without clamping the index could be -1 (the last element) or
    # len(ordered) (out of range), depending on how the rank rounds.
    index = max(0, min(index, len(ordered) - 1))
    return ordered[index]
311
def deriveStats(results):
    """
    Summarise the statistics of a run.

    Returns a dictionary that maps each statistic name to a dictionary
    holding its max, min, mean, 90th and 95th percentile, median and total.
    """
    # Assume all keys are the same in each statistics bucket.
    combined_data = defaultdict(list)

    # Collect data on paths length; the entry exists only when at least one
    # diagnostic was seen.
    pathLengths = [diagnostic.getPathLength()
                   for report in results.reports
                   for diagnostic in report.diagnostics]
    if pathLengths:
        combined_data['PathsLength'] = pathLengths

    for stat in results.stats:
        for key, value in stat.items():
            combined_data[key].append(value)

    combined_stats = {}
    for key, values in combined_data.items():
        count = len(values)
        ordered = sorted(values)
        total = sum(values)
        combined_stats[str(key)] = {
            "max": max(values),
            "min": min(values),
            "mean": total / count,
            "90th %tile": computePercentile(values, 0.9),
            "95th %tile": computePercentile(values, 0.95),
            "median": ordered[count // 2],
            "total": total
        }
    return combined_stats
336
337
def compareStats(resultsA, resultsB):
    """
    Print, for every statistic present in both runs, how each derived
    value changed from run A to run B.

    Statistics found in only one of the runs are skipped, because there
    is nothing to compare them with.
    """
    statsA = deriveStats(resultsA)
    statsB = deriveStats(resultsB)
    # Only apply highlighting when writing to TTY and it's not Windows
    highlight = sys.stdout.isatty() and os.name != 'nt'
    # Restricting to the shared keys avoids a KeyError when a statistic is
    # missing from run B.
    for key in sorted(set(statsA) & set(statsB)):
        print(key)
        for kkey in statsA[key]:
            valA = float(statsA[key][kkey])
            valB = float(statsB[key][kkey])
            report = "%.3f -> %.3f" % (valA, valB)
            if highlight and valB != 0:
                # Change relative to the new value: a drop of more than
                # 20% is shown in green, a rise of more than 20% in red.
                ratio = (valB - valA) / valB
                if ratio < -0.2:
                    report = Colors.GREEN + report + Colors.CLEAR
                elif ratio > 0.2:
                    report = Colors.RED + report + Colors.CLEAR
            print("\t %s %s" % (kkey, report))
357
def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True,
                             Stdout=sys.stdout):
    """
    Load two analysis runs and write their differences to Stdout.

    dirA, dirB - locations of the two runs
    opts - options providing rootA, rootB, verboseLog, show_stats,
           stats_only and the histogram flags read by compareResults
    deleteEmpty - passed on to loadResults
    Stdout - stream that receives the ADDED/REMOVED/TOTAL lines

    Returns the tuple (number of differences, number of diagnostics in A,
    number of diagnostics in B), or None when opts.stats_only is set.
    """
    # Load the run results.
    resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)
    if opts.show_stats:
        compareStats(resultsA, resultsB)
    if opts.stats_only:
        return

    # Open the verbose log, if given. Text mode is required because only
    # str is written to it.
    if opts.verboseLog:
        auxLog = open(opts.verboseLog, "w")
    else:
        auxLog = None

    try:
        diff = compareResults(resultsA, resultsB, opts)
        foundDiffs = 0
        totalAdded = 0
        totalRemoved = 0
        for a, b in diff:
            if a is None:
                Stdout.write("ADDED: %r\n" % b.getReadableName())
                foundDiffs += 1
                totalAdded += 1
                if auxLog:
                    auxLog.write("('ADDED', %r, %r)\n" % (
                        b.getReadableName(), b.getReport()))
            elif b is None:
                Stdout.write("REMOVED: %r\n" % a.getReadableName())
                foundDiffs += 1
                totalRemoved += 1
                if auxLog:
                    auxLog.write("('REMOVED', %r, %r)\n" % (
                        a.getReadableName(), a.getReport()))
            # Pairs present in both runs are not reported.

        TotalReports = len(resultsB.diagnostics)
        Stdout.write("TOTAL REPORTS: %r\n" % TotalReports)
        Stdout.write("TOTAL ADDED: %r\n" % totalAdded)
        Stdout.write("TOTAL REMOVED: %r\n" % totalRemoved)
        if auxLog:
            auxLog.write("('TOTAL NEW REPORTS', %r)\n" % TotalReports)
            auxLog.write("('TOTAL DIFFERENCES', %r)\n" % foundDiffs)
    finally:
        # Close the log even if comparing or writing fails part-way.
        if auxLog:
            auxLog.close()

    return foundDiffs, len(resultsA.diagnostics), len(resultsB.diagnostics)
407
def generate_option_parser():
    """
    Build and return the OptionParser describing the command line options.
    """
    # Help texts use implicit string concatenation: a backslash
    # continuation inside a literal would embed the next line's
    # indentation in the text.
    parser = OptionParser("usage: %prog [options] [dir A] [dir B]")
    parser.add_option("--rootA", dest="rootA",
                      help="Prefix to ignore on source files for directory A",
                      action="store", type=str, default="")
    parser.add_option("--rootB", dest="rootB",
                      help="Prefix to ignore on source files for directory B",
                      action="store", type=str, default="")
    parser.add_option("--verbose-log", dest="verboseLog",
                      help="Write additional information to LOG "
                           "[default=None]",
                      action="store", type=str, default=None,
                      metavar="LOG")
    parser.add_option("--relative-path-differences-histogram",
                      action="store_true", dest="relative_path_histogram",
                      default=False,
                      help="Show histogram of relative paths differences. "
                           "Requires matplotlib")
    parser.add_option("--relative-log-path-differences-histogram",
                      action="store_true", dest="relative_log_path_histogram",
                      default=False,
                      help="Show histogram of log relative paths "
                           "differences. Requires matplotlib")
    parser.add_option("--absolute-path-differences-histogram",
                      action="store_true", dest="absolute_path_histogram",
                      default=False,
                      help="Show histogram of absolute paths differences. "
                           "Requires matplotlib")
    parser.add_option("--stats-only", action="store_true", dest="stats_only",
                      default=False, help="Only show statistics on reports")
    parser.add_option("--show-stats", action="store_true", dest="show_stats",
                      default=False, help="Show change in statistics")
    return parser
441
442
def main():
    """
    Command line entry point: parse the options, require exactly two
    positional arguments and print the differences between the two runs.
    """
    parser = generate_option_parser()
    opts, args = parser.parse_args()

    # parser.error() reports the problem and terminates the program.
    if len(args) != 2:
        parser.error("invalid number of arguments")

    dumpScanBuildResultsDiff(args[0], args[1], opts)


if __name__ == '__main__':
    main()
457

Clang Project