"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated-file,
``suspicious.csv``, located in the output directory.

The file is utf-8 encoded, and each line contains four fields:

 * document name (normalized)
 * line number in the source document
 * problematic text
 * complete line showing the problematic text in context

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained into the source line

The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field.)

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv

from docutils import nodes
from sphinx.builders import Builder
import sphinx.util

# Fragments that look like reST/markup leaking into rendered output.
detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.VERBOSE).finditer


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source;
                                # this rule matches only near that.
                                # None -> don't care
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (single line only)
        self.used = False       # set when the rule suppresses an issue

    def __repr__(self):
        return '{0.docname},,{0.issue},{0.line}'.format(self)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create an empty output file and load the ignore-rule database."""
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        # Truncate any output left over from a previous run.
        open(self.log_file_name, 'w', encoding='utf-8').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        # Always re-check every document: the output is a report, not a build.
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched (likely stale)."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning(
                'Found %s/%s unused rules: %s' % (
                    len(unused_rules), len(self.rules),
                    '\n'.join(repr(rule) for rule in unused_rules),
                )
            )
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* found in *line* unless an ignore rule matches."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with levenshtein distance could work,
            # but that means bringing other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                    abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log the issue, append it to suspicious.csv, and fail the build."""
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                            (self.docname, lineno, issue, text))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        # The docstring promises utf-8 output, so name the encoding
        # explicitly instead of relying on the locale default.
        # newline='' is required by the csv module for correct quoting.
        with open(self.log_file_name, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            f = open(filename, 'r', encoding='utf-8', newline='')
        except IOError:
            # Missing rule file is fine: simply run with no ignore rules.
            return
        # 'with' guarantees the file is closed even if a row is malformed
        # and ValueError is raised below (the original leaked it).
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i + 1, row))
                docname, lineno, issue, text = row
                # An empty line-number field means "match anywhere".
                lineno = int(lineno) if lineno else None
                rules.append(Rule(docname, lineno, issue, text))
        self.logger.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the parent chain until an ancestor carries a line number;
    returns None if no ancestor does (instead of raising AttributeError
    when the top of the tree is reached, as the previous version did).
    """
    lineno = None
    while lineno is None and node is not None:
        node = node.parent
        if node is None:
            break
        lineno = node.line
    return lineno


def extract_line(text, index):
    """text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    >>> 'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...   print extract_line("abc\ndefgh\ni", i)
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0  # highest line number seen so far in this document

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)):  # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set()  # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too much false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode