#!/usr/bin/env python3

# Copyright The Mbed TLS Contributors
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

"""
This script checks the current state of the source code for minor issues,
including incorrect file permissions, presence of tabs, non-Unix line endings,
trailing whitespace, and presence of UTF-8 BOM.
Note: requires python 3, must be run from Mbed TLS root.
"""

import os
import argparse
import logging
import codecs
import re
import subprocess
import sys
try:
    from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
except ImportError:
    pass

import scripts_path # pylint: disable=unused-import
from mbedtls_dev import build_tree


class FileIssueTracker:
    """Base class for file-wide issue tracking.

    To implement a checker that processes a file as a whole, inherit from
    this class and implement `check_file_for_issue` and define ``heading``.

    ``suffix_exemptions``: files whose name ends with a string in this set
     will not be checked.

    ``path_exemptions``: files whose path (relative to the root of the source
    tree) matches this regular expression will not be checked. This can be
    ``None`` to match no path. Paths are normalized and converted to ``/``
    separators before matching.

    ``heading``: human-readable description of the issue
    """

    suffix_exemptions = frozenset() #type: FrozenSet[str]
    path_exemptions = None #type: Optional[Pattern[str]]
    # heading must be defined in derived classes.
    # pylint: disable=no-member

    def __init__(self):
        # Maps a file path to either a list of offending line numbers or
        # ``None`` when the issue concerns the file as a whole.
        self.files_with_issues = {}

    @staticmethod
    def normalize_path(filepath):
        """Normalize ``filepath`` with / as the directory separator."""
        filepath = os.path.normpath(filepath)
        # On Windows, we may have backslashes to separate directories.
        # We need slashes to match exemption lists.
        seps = os.path.sep
        if os.path.altsep is not None:
            seps += os.path.altsep
        # Split on *any one* of the separator characters. str.split() would
        # treat ``seps`` as a single multi-character delimiter (e.g. the
        # two-character string backslash+slash on Windows) and never match.
        return '/'.join(re.split('[' + re.escape(seps) + ']', filepath))

    def should_check_file(self, filepath):
        """Whether the given file name should be checked.

        Files whose name ends with a string listed in ``self.suffix_exemptions``
        or whose path matches ``self.path_exemptions`` will not be checked.
        """
        # str.endswith accepts a tuple of candidates; an empty tuple never
        # matches, so an empty exemption set exempts nothing.
        if filepath.endswith(tuple(self.suffix_exemptions)):
            return False
        if self.path_exemptions and \
           re.match(self.path_exemptions, self.normalize_path(filepath)):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check the specified file for the issue that this class is for.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def record_issue(self, filepath, line_number):
        """Record that an issue was found at the specified location."""
        self.files_with_issues.setdefault(filepath, []).append(line_number)

    def output_file_issues(self, logger):
        """Log all the locations where the issue was found.

        Nothing is logged if no issue was recorded. Otherwise the heading
        is logged, followed by one line per file (with the offending line
        numbers when they are known) and an empty line.
        """
        if not self.files_with_issues:
            return
        logger.info(self.heading)
        for filename, lines in sorted(self.files_with_issues.items()):
            if lines:
                logger.info("%s: %s",
                            filename, ", ".join(str(x) for x in lines))
            else:
                logger.info(filename)
        logger.info("")


BINARY_FILE_PATH_RE_LIST = [
    r'docs/.*\.pdf\Z',
    r'programs/fuzz/corpuses/[^.]+\Z',
    r'tests/data_files/[^.]+\Z',
    r'tests/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
    r'tests/data_files/.*\.req\.[^/]+\Z',
    r'tests/data_files/.*malformed[^/]+\Z',
    r'tests/data_files/format_pkcs12\.fmt\Z',
    r'tests/data_files/.*\.bin\Z',
]
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))


class LineIssueTracker(FileIssueTracker):
    """Base class for line-by-line issue tracking.

    To implement a checker that processes files line by line, inherit from
    this class and implement `issue_with_line`.
    """

    # Exclude binary files.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Check the specified line for the issue that this class is for.

        ``line`` is the raw content of the line as a byte string, including
        its line terminator if any. ``line_number`` is 1-based.
        Return a true value if the line has the issue.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Record an issue at ``line_number`` if ``line`` has the issue."""
        if self.issue_with_line(line, filepath, line_number):
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Check the lines of the specified file.

        Subclasses must implement the ``issue_with_line`` method.
        """
        # Read in binary mode: the checks work on raw bytes so that invalid
        # encodings and the exact line terminators remain visible.
        with open(filepath, "rb") as f:
            for line_number, line in enumerate(f, start=1):
                self.check_file_line(filepath, line, line_number)


def is_windows_file(filepath):
    """Whether ``filepath`` has an extension used for Windows-specific files."""
    _root, ext = os.path.splitext(filepath)
    return ext in ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')


class PermissionIssueTracker(FileIssueTracker):
    """Track files with bad permissions.

    Files that are not executable scripts must not be executable."""

    heading = "Incorrect permissions:"

    # .py files can be either full scripts or modules, so they may or may
    # not be executable.
    suffix_exemptions = frozenset({".py"})

    def check_file_for_issue(self, filepath):
        """Flag ``filepath`` if its executable bit disagrees with its suffix."""
        is_executable = os.access(filepath, os.X_OK)
        should_be_executable = filepath.endswith((".sh", ".pl"))
        if is_executable != should_be_executable:
            self.files_with_issues[filepath] = None


class ShebangIssueTracker(FileIssueTracker):
    """Track files with a bad, missing or extraneous shebang line.

    Executable scripts must start with a valid shebang (#!) line.
    """

    heading = "Invalid shebang line:"

    # Allow either /bin/sh, /bin/bash, or /usr/bin/env.
    # Allow at most one argument (this is a Linux limitation).
    # For sh and bash, the argument if present must be options.
    # For env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')
    # Map each accepted interpreter to the file extension it requires.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    def is_valid_shebang(self, first_line, filepath):
        """Whether ``first_line`` is an acceptable shebang line for ``filepath``.

        ``first_line`` is a byte string. The interpreter must be one of the
        known ones and ``filepath`` must have the matching extension.
        """
        m = self._shebang_re.match(first_line)
        if not m:
            return False
        # Exactly one of the two groups is set, depending on which
        # alternative of the regex matched.
        interpreter = m.group(1) or m.group(2)
        if interpreter not in self._extensions:
            return False
        return filepath.endswith('.' + self._extensions[interpreter])

    def check_file_for_issue(self, filepath):
        """Flag ``filepath`` if its shebang line and executable bit disagree,
        or if its shebang line is not an accepted one."""
        is_executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as f:
            first_line = f.readline()
        if first_line.startswith(b'#!'):
            if not is_executable:
                # Shebang on a non-executable file
                self.files_with_issues[filepath] = None
            elif not self.is_valid_shebang(first_line, filepath):
                self.files_with_issues[filepath] = [1]
        elif is_executable:
            # Executable without a shebang
            self.files_with_issues[filepath] = None


class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files that end with an incomplete line
    (no newline character at the end of the last line)."""

    heading = "Missing newline at end of file:"

    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Flag ``filepath`` if it is non-empty and its last byte is not LF."""
        with open(filepath, "rb") as f:
            try:
                f.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. If we can't seek
                # 1 before the end, it means that this position is before
                # the beginning of the file, i.e. that the file is empty.
                return
            if f.read(1) != b"\n":
                self.files_with_issues[filepath] = None


class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.
    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""

    heading = "UTF-8 BOM present:"

    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Flag ``filepath`` if its first bytes are the UTF-8 BOM."""
        with open(filepath, "rb") as f:
            # Only the first few bytes matter: don't read the whole file.
            if f.read(len(codecs.BOM_UTF8)) == codecs.BOM_UTF8:
                self.files_with_issues[filepath] = None


class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only allow valid UTF-8, and only other explicitly allowed characters.
    # We deliberately exclude all characters that aren't a simple non-blank,
    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
    # line breaks, "basic" no-break space and soft hyphen). In particular,
    # non-ASCII control characters, combining characters, and Unicode state
    # changes (e.g. right-to-left text) are forbidden.
    # Note that we do allow some characters with a risk of visual confusion,
    # for example '-' (U+002D HYPHEN-MINUS) vs U+00AD SOFT HYPHEN vs
    # '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs
    # 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA).
    GOOD_CHARACTERS = ''.join([
        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
        '\u2190-\u21FF', # Arrows
        '\u2200-\u22FF', # Mathematical Symbols
        '\u2500-\u257F' # Box Drawings characters used in markdown trees
    ])
    # Allow any of the characters and ranges above, and anything classified
    # as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Whether ``line`` is not valid UTF-8 or has a forbidden character."""
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        if line_number == 1 and text.startswith('\uFEFF'):
            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
            # Which files are allowed to have a BOM is handled in
            # Utf8BomIssueTracker.
            text = text[1:]
        return not self.GOOD_CHARACTERS_RE.match(text)


class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check every non-exempt file except Windows-specific files."""
        if not super().should_check_file(filepath):
            return False
        return not is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` contains a CR anywhere."""
        return b"\r" in line


class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check only non-exempt Windows-specific files."""
        if not super().should_check_file(filepath):
            return False
        return is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` lacks a CRLF terminator or has a stray CR."""
        return not line.endswith(b"\r\n") or b"\r" in line[:-2]


class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"
    suffix_exemptions = frozenset([".dsp", ".md"])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` has whitespace before its line terminator."""
        # Stripping only the terminator and stripping all trailing
        # whitespace give different results exactly when there is
        # whitespace before the terminator.
        return line.rstrip(b"\r\n") != line.rstrip()


class TabIssueTracker(LineIssueTracker):
    """Track lines with tabs."""

    heading = "Tabs present:"
    suffix_exemptions = frozenset([
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    ])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` contains a tab character."""
        return b"\t" in line


class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.
    These are leftovers from a ``git merge`` that wasn't fully edited."""

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether ``line`` looks like a leftover git conflict marker."""
        # Detect leftover git conflict markers.
        if line.startswith((b'<<<<<<< ', b'>>>>>>> ')):
            return True
        if line.startswith(b'||||||| '): # from merge.conflictStyle=diff3
            return True
        # A line consisting of '=======' is not reported in .md files.
        if line.rstrip(b'\r\n') == b'=======' and \
           not _filepath.endswith('.md'):
            return True
        return False


class IntegrityChecker:
    """Sanity-check files under the current directory."""

    def __init__(self, log_file):
        """Instantiate the sanity checker.
        Check files under the current directory.
        Write a report of issues to log_file."""
        build_tree.check_repo_path()
        self.logger = None
        self.setup_logger(log_file)
        self.issues_to_check = [
            PermissionIssueTracker(),
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),
            TabIssueTracker(),
            MergeArtifactIssueTracker(),
        ]

    def setup_logger(self, log_file, level=logging.INFO):
        """Configure the root logger to write to ``log_file``.

        If ``log_file`` is empty or ``None``, add a ``logging.StreamHandler``
        with its default stream instead.
        """
        self.logger = logging.getLogger()
        self.logger.setLevel(level)
        if log_file:
            handler = logging.FileHandler(log_file)
            self.logger.addHandler(handler)
        else:
            console = logging.StreamHandler()
            self.logger.addHandler(console)

    @staticmethod
    def collect_files():
        """Return the list of files reported by ``git ls-files``."""
        bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
        # The output is NUL-terminated, so drop the empty trailing element.
        bytes_filepaths = bytes_output.split(b'\0')[:-1]
        # NOTE(review): decoding as ASCII raises UnicodeDecodeError on a
        # non-ASCII file name; presumably such names are not expected in
        # the repository -- confirm.
        ascii_filepaths = [fp.decode('ascii') for fp in bytes_filepaths]
        # Prepend './' to files in the top-level directory so that
        # something like `'/Makefile' in fp` matches in the top-level
        # directory as well as in subdirectories.
        return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
                for fp in ascii_filepaths]

    def check_files(self):
        """Run every issue tracker on every file that it applies to."""
        # List the files once rather than spawning git for each tracker.
        filepaths = self.collect_files()
        for issue_to_check in self.issues_to_check:
            for filepath in filepaths:
                if issue_to_check.should_check_file(filepath):
                    issue_to_check.check_file_for_issue(filepath)

    def output_issues(self):
        """Log all recorded issues.

        Return 1 if any issue was found, 0 otherwise.
        """
        integrity_return_code = 0
        for issue_to_check in self.issues_to_check:
            if issue_to_check.files_with_issues:
                integrity_return_code = 1
            issue_to_check.output_file_issues(self.logger)
        return integrity_return_code


def run_main():
    """Parse the command line, run all checks and exit with the result code."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-l", "--log_file", type=str, help="path to optional output log",
    )
    check_args = parser.parse_args()
    integrity_check = IntegrityChecker(check_args.log_file)
    integrity_check.check_files()
    return_code = integrity_check.output_issues()
    sys.exit(return_code)


if __name__ == "__main__":
    run_main()