1"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
5Suspicious lines are reported in a comma-separated-file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
15It is common to find many false positives. To avoid reporting them
16again and again, they may be added to the ``ignored.csv`` file
17(located in the configuration directory). The file has the same
18format as ``suspicious.csv`` with a few differences:
19
20  - each line defines a rule; if the rule matches, the issue
21    is ignored.
22  - line number may be empty (that is, nothing between the
23    commas: ",,"). In this case, line numbers are ignored (the
24    rule matches anywhere in the file).
25  - the last field does not have to be a complete line; some
26    surrounding text (never more than a line) is enough for
27    context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
34 * the rule text is completely contained into the source line
35
36The simplest way to create the ignored.csv file is by copying
37undesired entries from suspicious.csv (possibly trimming the last
38field.)
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
44import os
45import re
46import csv
47
48from docutils import nodes
49from sphinx.builders import Builder
50import sphinx.util
51
# Pre-bound ``finditer`` over a verbose regex that matches markup fragments
# which commonly leak into rendered output; each alternative is explained by
# the inline comments below.  Call as ``detect_all(text)`` to iterate matches.
detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.VERBOSE).finditer
58
59
class Rule:
    """A rule for ignoring a reported issue.

    Attributes:
        docname: document (normalized name) to which this rule applies.
        lineno: line number in the original source near which the rule
            matches; ``None`` means "match anywhere in the document".
        issue: the markup fragment that triggered this rule.
        line: text of the container element (a single line is enough;
            the rule matches when it is contained in the source line).
        used: set to True once the rule has suppressed an issue, so
            stale rules can be reported at the end of the build.
    """

    def __init__(self, docname, lineno, issue, line):
        self.docname = docname
        self.lineno = lineno
        self.issue = issue
        self.line = line
        self.used = False

    def __repr__(self):
        # The line-number field is deliberately left empty (",,") so the
        # output can be pasted into the ignore file as a rule that matches
        # anywhere in the document.
        return '{0.docname},,{0.issue},{0.line}'.format(self)
73
74
75
class dialect(csv.excel):
    """CSV dialect identical to Excel's, except that rows are terminated
    by a bare linefeed instead of CRLF."""
    lineterminator = '\n'
79
80
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create (truncate) the output file and load the ignore rules."""
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        # every document is rechecked on each run
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        # this builder produces no document output
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* and report every suspicious fragment found."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched during this build."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning(
                'Found %s/%s unused rules: %s' % (
                    len(unused_rules), len(self.rules),
                    '\n'.join(repr(rule) for rule in unused_rules),
                )
            )

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless it is suppressed by an ignore rule."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Requiring both lines to match *exactly* would be rather
            # strict; fuzzy matching (e.g. Levenshtein distance) would
            # mean bringing in other libraries, so relax the requirement:
            # just check that the rule fragment is contained in the
            # document line.
            if rule.line not in line:
                continue
            # Check both line numbers; if they are "near" (at most 5 lines
            # apart) this rule matches.  lineno=None means "don't care".
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log the issue to the console, append it to suspicious.csv, and
        mark the build as failed."""
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                                (self.docname, lineno, issue, text))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        # use a context manager so the file is closed even if writing
        # fails; the output file is documented to be UTF-8
        with open(self.log_file_name, 'a', encoding='utf-8') as f:
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            f = open(filename, 'r', encoding='utf-8')
        except IOError:
            # a missing ignore file simply means there are no rules
            return
        # close the file even if a malformed row raises ValueError below
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                # an empty line-number field means "match anywhere"
                lineno = int(lineno) if lineno else None
                rules.append(Rule(docname, lineno, issue, text))
        self.logger.info('done, %d rules loaded' % len(self.rules))
187
188
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the parent chain until an ancestor with a line number is
    found.  Returns None when the chain is exhausted without finding one
    (the original code crashed with AttributeError in that case).  The
    node's own ``line`` attribute is intentionally skipped: lookup starts
    at the parent, preserving the original traversal.
    """
    lineno = None
    while lineno is None and node is not None:
        node = node.parent
        if node is None:
            break  # reached the top of the tree without a line number
        lineno = node.line
    return lineno
196
197
def extract_line(text, index):
    """Extract from *text* (possibly a multiline string) the single line
    containing the character at position *index*.

    >>> extract_line("abc\\ndefgh\\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\\ndefgh\\ni", i))
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1  # start of the containing line
    q = text.find('\n', index)          # end of the containing line
    if q < 0:
        q = len(text)                   # last line has no trailing newline
    return text[p:q]
218
219
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Walks a doctree, checking every text-bearing node for suspicious
    markup fragments and reporting them through the builder."""

    lastlineno = 0  # class-level default, kept for compatibility

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder
        # track per instance instead of relying only on the class default
        self.lastlineno = 0

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?); never let it
            # decrease within a document
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        # reset the line tracker at the start of each document
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode
252