utils/analyzer/CmpRuns.py

*67e74705SXin Li#!/usr/bin/env python
*67e74705SXin Li
*67e74705SXin Li"""
*67e74705SXin LiCmpRuns - A simple tool for comparing two static analyzer runs to determine
*67e74705SXin Liwhich reports have been added, removed, or changed.
*67e74705SXin Li
*67e74705SXin LiThis is designed to support automated testing using the static analyzer, from
*67e74705SXin Litwo perspectives:
*67e74705SXin Li  1. To monitor changes in the static analyzer's reports on real code bases, for
*67e74705SXin Li     regression testing.
*67e74705SXin Li
*67e74705SXin Li  2. For use by end users who want to integrate regular static analyzer testing
*67e74705SXin Li     into a buildbot like environment.
*67e74705SXin Li
*67e74705SXin LiUsage:
*67e74705SXin Li
*67e74705SXin Li    # Load the results of both runs, to obtain lists of the corresponding
*67e74705SXin Li    # AnalysisDiagnostic objects.
*67e74705SXin Li    #
*67e74705SXin Li    resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
*67e74705SXin Li    resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)
*67e74705SXin Li
*67e74705SXin Li    # Generate a relation from diagnostics in run A to diagnostics in run B
*67e74705SXin Li    # to obtain a list of triples (a, b, confidence).
*67e74705SXin Li    diff = compareResults(resultsA, resultsB)
*67e74705SXin Li
*67e74705SXin Li"""
*67e74705SXin Li
*67e74705SXin Liimport os
*67e74705SXin Liimport plistlib
*67e74705SXin Liimport CmpRuns
*67e74705SXin Li
*67e74705SXin Li# Information about analysis run:
*67e74705SXin Li# path - the analysis output directory
*67e74705SXin Li# root - the name of the root directory, which will be disregarded when
*67e74705SXin Li# determining the source file name
class SingleRunInfo:
    """Describe one analyzer run that is about to be loaded.

    path       -- the analysis output directory.
    root       -- name of the root directory, disregarded when determining
                  source file names; trailing '/' and '\\' are removed.
    verboseLog -- stored unchanged.
    """

    def __init__(self, path, root="", verboseLog=None):
        # Normalise the root first so that "/src/" and "/src" behave alike.
        normalizedRoot = root.rstrip("/\\")
        self.verboseLog = verboseLog
        self.root = normalizedRoot
        self.path = path
*67e74705SXin Li
class AnalysisDiagnostic:
    """A single diagnostic taken from an analyzer plist report.

    data       -- the diagnostic's dictionary; must contain 'location'.
    report     -- the owning report; its 'files' list and 'run' are consulted.
    htmlReport -- file name of the matching HTML report, or None.
    """

    def __init__(self, data, report, htmlReport):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._htmlReport = htmlReport

    def getFileName(self):
        """Return the source file name, made relative to the run's root.

        The root is stripped only when it is a whole leading path component
        of the file name. Otherwise the name is returned unchanged.
        """
        root = self._report.run.root
        fileName = self._report.files[self._loc['file']]
        if not root or not fileName.startswith(root):
            return fileName
        rest = fileName[len(root):]
        # A bare string-prefix match is not enough: with root "/src" the file
        # "/srcother/b.c" lies outside the root and must not be truncated.
        if rest and rest[0] not in "/\\":
            return fileName
        # Drop the separator that follows the root.
        return rest[1:]

    def getLine(self):
        """Return the 'line' entry of the diagnostic's location."""
        return self._loc['line']

    def getColumn(self):
        """Return the 'col' entry of the diagnostic's location."""
        return self._loc['col']

    def getCategory(self):
        """Return the diagnostic's 'category' entry."""
        return self._data['category']

    def getDescription(self):
        """Return the diagnostic's 'description' entry."""
        return self._data['description']

    def getIssueIdentifier(self):
        """Return the string used to match this diagnostic across runs.

        It is built from the file name, then the issue context and the
        content hash of the line when those entries are present.
        """
        ident = self.getFileName() + "+"
        if 'issue_context' in self._data:
            ident += self._data['issue_context'] + "+"
        if 'issue_hash_content_of_line_in_context' in self._data:
            ident += str(self._data['issue_hash_content_of_line_in_context'])
        return ident

    def getReport(self):
        """Return the HTML report path inside the run's output directory.

        A single space is returned when the diagnostic has no HTML report.
        """
        if self._htmlReport is None:
            return " "
        return os.path.join(self._report.run.path, self._htmlReport)

    def getReadableName(self):
        """Return 'file:line:col, category: description'."""
        return '%s:%d:%d, %s: %s' % (self.getFileName(), self.getLine(),
                                     self.getColumn(), self.getCategory(),
                                     self.getDescription())

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def getRawData(self):
        """Return the underlying diagnostic dictionary."""
        return self._data
*67e74705SXin Li
class multidict:
    """Dictionary wrapper that keeps every value assigned to a key.

    Assigning to a key appends to that key's list instead of replacing it;
    reading a key returns the whole list.
    """

    def __init__(self, elts=()):
        self.data = {}
        # Route through item assignment so duplicate keys accumulate.
        for pair in elts:
            k, v = pair
            self[k] = v

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def __setitem__(self, key, value):
        # Create the list on first use, then append in every case.
        self.data.setdefault(key, []).append(value)

    def get(self, key, default=None):
        return self.data.get(key, default)

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()
*67e74705SXin Li
class CmpOptions:
    """Plain holder for the comparison options.

    verboseLog -- stored unchanged (defaults to None).
    rootA      -- stored unchanged (defaults to "").
    rootB      -- stored unchanged (defaults to "").
    """

    def __init__(self, verboseLog=None, rootA="", rootB=""):
        self.verboseLog = verboseLog
        self.rootA, self.rootB = rootA, rootB
*67e74705SXin Li
class AnalysisReport:
    """One report belonging to a run.

    run         -- the run this report is part of.
    files       -- the report's file list.
    diagnostics -- starts empty; filled in by whoever builds the report.
    """

    def __init__(self, run, files):
        # Each report gets its own, initially empty, diagnostics list.
        self.diagnostics = []
        self.files = files
        self.run = run
*67e74705SXin Li
class AnalysisRun:
    """The reports and diagnostics collected for one analyzer run.

    info -- run description; its 'path' and 'root' attributes are copied.
    """

    def __init__(self, info):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics = []
        self.clang_version = None

    def getClangVersion(self):
        """Return the recorded clang version, or None if none was seen."""
        return self.clang_version

    def readSingleFile(self, p, deleteEmpty):
        """Read the plist file at path p and record its report.

        A file whose 'files' entry is empty contributes nothing; it is
        removed from disk when deleteEmpty is true.
        """
        # plistlib.readPlist is absent from newer Python 3 releases, while
        # plistlib.load is absent from Python 2; use whichever exists.
        if hasattr(plistlib, 'load'):
            with open(p, 'rb') as plistFile:
                data = plistlib.load(plistFile)
        else:
            data = plistlib.readPlist(p)

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version, so only the first one seen is kept.
        if 'clang_version' in data:
            version = data.pop('clang_version')
            if self.clang_version is None:
                self.clang_version = version

        # Ignore/delete empty reports.
        if not data['files']:
            if deleteEmpty:
                os.remove(p)
            return

        # Extract the HTML reports, if they exist. Each diagnostic is checked
        # on its own, so an empty list or a mix of diagnostics with and
        # without HTML output is handled.
        htmlFiles = []
        for d in data['diagnostics']:
            if 'HTMLDiagnostics_files' in d:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
            else:
                htmlFiles.append(None)

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(data.pop('diagnostics'), htmlFiles)]

        # Every top-level key should have been consumed by now.
        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)
*67e74705SXin Li
*67e74705SXin Li
*67e74705SXin Li# Backward compatibility API.
def loadResults(path, opts, root = "", deleteEmpty=True):
    """Older entry point: wrap the arguments and load a single run.

    path, root and opts.verboseLog are packed into a SingleRunInfo, which is
    then handed to loadResultsFromSingleRun together with deleteEmpty.
    """
    runInfo = SingleRunInfo(path, root, opts.verboseLog)
    return loadResultsFromSingleRun(runInfo, deleteEmpty)
*67e74705SXin Li
# Load the results of the analyses from a given output folder.
# - info is the SingleRunInfo object
# - deleteEmpty specifies whether empty plist files should be deleted
def loadResultsFromSingleRun(info, deleteEmpty=True):
    """Build an AnalysisRun from the plist output found at info.path.

    When info.path is a regular file it is read directly. Otherwise the
    path is walked recursively and every file whose name ends in 'plist' is
    read. deleteEmpty is forwarded to each read.
    """
    location = info.path
    run = AnalysisRun(info)

    # Single-file case: nothing to walk.
    if os.path.isfile(location):
        run.readSingleFile(location, deleteEmpty)
        return run

    for dirpath, _dirnames, filenames in os.walk(location):
        for name in filenames:
            if name.endswith('plist'):
                run.readSingleFile(os.path.join(dirpath, name), deleteEmpty)

    return run
*67e74705SXin Li
def cmpAnalysisDiagnostic(d) :
    """Sort key for a diagnostic: its issue identifier."""
    issueId = d.getIssueIdentifier()
    return issueId
*67e74705SXin Li
def compareResults(A, B):
    """
    compareResults - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    The result is the relation as a list of triples (a, b, confidence) where
    each element {a,b} is None or an element from the respective run, and
    confidence is a measure of the match quality (where 0 indicates equality,
    and None is used if either element is None).

    Matched pairs come first, followed by the diagnostics found only in A,
    then those found only in B. Neither run's diagnostics list is modified.
    """

    res = []

    # Compute each issue identifier exactly once and keep it next to its
    # diagnostic; the identifier is rebuilt from strings on every call.
    byIdentifier = lambda pair: pair[0]
    eltsA = sorted(((d.getIssueIdentifier(), d) for d in A.diagnostics),
                   key=byIdentifier)
    eltsB = sorted(((d.getIssueIdentifier(), d) for d in B.diagnostics),
                   key=byIdentifier)

    # Quickly eliminate equal elements by merging the two sorted lists from
    # their largest identifiers downwards.
    neqA = []
    neqB = []
    while eltsA and eltsB:
        idA, a = eltsA[-1]
        idB, b = eltsB[-1]
        if idA == idB:
            eltsA.pop()
            eltsB.pop()
            res.append((a, b, 0))
        elif idA > idB:
            # a is larger than everything left in B, so it has no match.
            eltsA.pop()
            neqA.append(a)
        else:
            # b is larger than everything left in A, so it has no match.
            eltsB.pop()
            neqB.append(b)
    # Whatever remains on either side is unmatched as well.
    neqA.extend(d for _, d in eltsA)
    neqB.extend(d for _, d in eltsB)

    # FIXME: Add fuzzy matching. One simple and possible effective idea would be
    # to bin the diagnostics, print them in a normalized form (based solely on
    # the structure of the diagnostic), compute the diff, then use that as the
    # basis for matching. This has the nice property that we don't depend in any
    # way on the diagnostic format.

    for a in neqA:
        res.append((a, None, None))
    for b in neqB:
        res.append((None, b, None))

    return res
*67e74705SXin Li
def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True):
    """Print the differences between the analyzer results in dirA and dirB.

    opts supplies rootA, rootB and verboseLog. Each added, removed or changed
    diagnostic is printed to standard output, followed by the totals. When
    opts.verboseLog is set, the same events are also written to that file
    together with the report paths.

    Returns a tuple (number of differences, number of diagnostics in A,
    number of diagnostics in B).
    """
    # Load the run results.
    resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)

    # Open the verbose log, if given.
    if opts.verboseLog:
        auxLog = open(opts.verboseLog, "wb")
    else:
        auxLog = None

    def log(message):
        # The log is opened in binary mode, so encode explicitly; this gives
        # the same bytes on Python 2 and Python 3.
        if auxLog:
            auxLog.write((message + "\n").encode("utf-8"))

    try:
        foundDiffs = 0
        for a, b, confidence in compareResults(resultsA, resultsB):
            if a is None:
                print("ADDED: %r" % b.getReadableName())
                log("('ADDED', %r, %r)" % (b.getReadableName(),
                                           b.getReport()))
            elif b is None:
                print("REMOVED: %r" % a.getReadableName())
                log("('REMOVED', %r, %r)" % (a.getReadableName(),
                                             a.getReport()))
            elif confidence:
                # Only reached for a matched pair with non-zero confidence;
                # compareResults currently reports 0 for every match.
                print("CHANGED: %r to %r" % (a.getReadableName(),
                                             b.getReadableName()))
                log("('CHANGED', %r, %r, %r, %r)"
                    % (a.getReadableName(),
                       b.getReadableName(),
                       a.getReport(),
                       b.getReport()))
            else:
                # Identical diagnostics are not a difference.
                continue
            foundDiffs += 1

        TotalReports = len(resultsB.diagnostics)
        print("TOTAL REPORTS: %r" % TotalReports)
        print("TOTAL DIFFERENCES: %r" % foundDiffs)
        log("('TOTAL NEW REPORTS', %r)" % TotalReports)
        log("('TOTAL DIFFERENCES', %r)" % foundDiffs)

        return foundDiffs, len(resultsA.diagnostics), len(resultsB.diagnostics)
    finally:
        # Always release the log file, even if reporting fails part-way.
        if auxLog:
            auxLog.close()
*67e74705SXin Li
def main():
    """Command-line entry point: compare the two directories given as arguments.

    Accepts --rootA, --rootB and --verbose-log, requires exactly two
    positional arguments, and passes everything on to
    dumpScanBuildResultsDiff.
    """
    from optparse import OptionParser

    optParser = OptionParser("usage: %prog [options] [dir A] [dir B]")
    optParser.add_option(
        "", "--rootA", dest="rootA",
        help="Prefix to ignore on source files for directory A",
        action="store", type=str, default="")
    optParser.add_option(
        "", "--rootB", dest="rootB",
        help="Prefix to ignore on source files for directory B",
        action="store", type=str, default="")
    optParser.add_option(
        "", "--verbose-log", dest="verboseLog",
        help="Write additional information to LOG [default=None]",
        action="store", type=str, default=None,
        metavar="LOG")

    opts, positional = optParser.parse_args()

    # Exactly two directories are expected; anything else is a usage error.
    if len(positional) != 2:
        optParser.error("invalid number of arguments")

    dumpScanBuildResultsDiff(positional[0], positional[1], opts)
*67e74705SXin Li
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()