#!/usr/bin/env python3

# Copyright The Mbed TLS Contributors
# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later

"""
This script checks the current state of the source code for minor issues,
including incorrect file permissions, presence of tabs, non-Unix line endings,
trailing whitespace, and presence of UTF-8 BOM.
Note: requires python 3, must be run from Mbed TLS root.
"""

import argparse
import codecs
import inspect
import logging
import os
import re
import subprocess
import sys
try:
    from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
except ImportError:
    pass

import scripts_path # pylint: disable=unused-import
from mbedtls_framework import build_tree


class FileIssueTracker:
    """Base class for file-wide issue tracking.

    To implement a checker that processes a file as a whole, inherit from
    this class and implement `check_file_for_issue` and define ``heading``.

    ``suffix_exemptions``: files whose name ends with a string in this set
     will not be checked.

    ``path_exemptions``: files whose path (relative to the root of the source
    tree) matches this regular expression will not be checked. This can be
    ``None`` to match no path. Paths are normalized and converted to ``/``
    separators before matching.

    ``heading``: human-readable description of the issue

    ``files_with_issues``: dictionary mapping each offending file path to
    the list of line numbers where the issue was found, or to ``None`` when
    the issue concerns the file as a whole.
    """

    suffix_exemptions = frozenset() #type: FrozenSet[str]
    path_exemptions = None #type: Optional[Pattern[str]]
    # heading must be defined in derived classes.
    # pylint: disable=no-member

    def __init__(self):
        self.files_with_issues = {}

    @staticmethod
    def normalize_path(filepath):
        """Normalize ``filepath`` with / as the directory separator."""
        filepath = os.path.normpath(filepath)
        # On Windows, we may have backslashes to separate directories.
        # We need slashes to match exemption lists.
        # Replace each separator character on its own: str.split() with a
        # multi-character argument would only split on that exact sequence
        # of characters, not on any one of them.
        for sep in (os.path.sep, os.path.altsep):
            if sep is not None and sep != '/':
                filepath = filepath.replace(sep, '/')
        return filepath

    def should_check_file(self, filepath):
        """Whether the given file name should be checked.

        Files whose name ends with a string listed in ``self.suffix_exemptions``
        or whose path matches ``self.path_exemptions`` will not be checked.
        """
        if filepath.endswith(tuple(self.suffix_exemptions)):
            return False
        if self.path_exemptions and \
           re.match(self.path_exemptions, self.normalize_path(filepath)):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check the specified file for the issue that this class is for.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def record_issue(self, filepath, line_number):
        """Record that an issue was found at the specified location."""
        self.files_with_issues.setdefault(filepath, []).append(line_number)

    def output_file_issues(self, logger):
        """Log all the locations where the issue was found.

        Nothing is logged if no issue was recorded. Otherwise log the
        heading, then one line per file (with the line numbers if any),
        then an empty line.
        """
        if not self.files_with_issues:
            return
        logger.info(self.heading)
        for filename, lines in sorted(self.files_with_issues.items()):
            if lines:
                logger.info("{}: {}".format(
                    filename, ", ".join(str(x) for x in lines)
                ))
            else:
                # Whole-file issue: there are no line numbers to report.
                logger.info(filename)
        logger.info("")

BINARY_FILE_PATH_RE_LIST = [
    r'docs/.*\.pdf\Z',
    r'docs/.*\.png\Z',
    r'programs/fuzz/corpuses/[^.]+\Z',
    r'framework/data_files/[^.]+\Z',
    r'framework/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
    r'framework/data_files/.*\.req\.[^/]+\Z',
    r'framework/data_files/.*malformed[^/]+\Z',
    r'framework/data_files/format_pkcs12\.fmt\Z',
    r'framework/data_files/.*\.bin\Z',
]
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))

class LineIssueTracker(FileIssueTracker):
    """Base class for line-by-line issue tracking.

    To implement a checker that processes files line by line, inherit from
    this class and implement `issue_with_line`.
    """

    # Exclude binary files.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Check the specified line for the issue that this class is for.

        ``line`` is the content of the line as a byte string, including
        its line terminator if any. ``line_number`` starts at 1.
        Return a true value if the line has the issue.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Check one line and record its location if it has the issue."""
        if self.issue_with_line(line, filepath, line_number):
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Check the lines of the specified file.

        Subclasses must implement the ``issue_with_line`` method.
        """
        # Read in binary mode: the file may not be valid text, and line
        # ending checks need to see the exact bytes.
        with open(filepath, "rb") as f:
            for line_number, line in enumerate(f, 1):
                self.check_file_line(filepath, line, line_number)


def is_windows_file(filepath):
    """Whether the file name has one of the Windows-specific extensions.

    Such files are checked for Windows line endings rather than Unix ones.
    """
    _root, ext = os.path.splitext(filepath)
    return ext in ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')


class ShebangIssueTracker(FileIssueTracker):
    """Track files with a bad, missing or extraneous shebang line.

    Executable scripts must start with a valid shebang (#!) line.
    """

    heading = "Invalid shebang line:"

    # Allow either /bin/sh, /bin/bash, or /usr/bin/env.
    # Allow at most one argument (this is a Linux limitation).
    # For sh and bash, the argument if present must be options.
    # For env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')
    # Map each accepted interpreter to the file extension that scripts
    # for this interpreter must have.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    path_exemptions = re.compile(r'tests/scripts/quiet/.*')

    def is_valid_shebang(self, first_line, filepath):
        """Whether ``first_line`` is an acceptable shebang for ``filepath``.

        The line must match the permitted syntax, name a known interpreter,
        and the file name must have the extension for that interpreter.
        """
        m = self._shebang_re.match(first_line)
        if not m:
            return False
        interpreter = m.group(1) or m.group(2)
        if interpreter not in self._extensions:
            return False
        return filepath.endswith('.' + self._extensions[interpreter])

    def check_file_for_issue(self, filepath):
        """Check that the file has a shebang line if and only if it is
        executable, and that the shebang line is valid if present.

        An invalid shebang is recorded on line 1. A mismatch between the
        presence of a shebang and the executable permission is recorded
        as a whole-file issue.
        """
        is_executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as f:
            first_line = f.readline()
        if first_line.startswith(b'#!'):
            if not is_executable:
                # Shebang on a non-executable file
                self.files_with_issues[filepath] = None
            elif not self.is_valid_shebang(first_line, filepath):
                self.files_with_issues[filepath] = [1]
        elif is_executable:
            # Executable without a shebang
            self.files_with_issues[filepath] = None


class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files that end with an incomplete line
    (no newline character at the end of the last line)."""

    heading = "Missing newline at end of file:"

    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record the file if it is non-empty and its last byte is not LF."""
        with open(filepath, "rb") as f:
            try:
                f.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. If we can't seek
                # 1 before the end, it means that this position is before
                # the beginning of the file, i.e. that the file is empty.
                return
            if f.read(1) != b"\n":
                self.files_with_issues[filepath] = None


class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.
    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""

    heading = "UTF-8 BOM present:"

    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record the file if its first bytes are the UTF-8 BOM."""
        with open(filepath, "rb") as f:
            # Only the first few bytes matter: don't read the whole file.
            if f.read(len(codecs.BOM_UTF8)) == codecs.BOM_UTF8:
                self.files_with_issues[filepath] = None


class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only allow valid UTF-8, and only other explicitly allowed characters.
    # We deliberately exclude all characters that aren't a simple non-blank,
    # non-zero-width glyph, apart from a very small set (tab, ordinary space,
    # line breaks, "basic" no-break space and soft hyphen). In particular,
    # non-ASCII control characters, combining characters, and Unicode state
    # changes (e.g. right-to-left text) are forbidden.
    # Note that we do allow some characters with a risk of visual confusion,
    # for example U+002D HYPHEN-MINUS vs U+00AD SOFT HYPHEN vs U+2010 HYPHEN,
    # or U+0041 LATIN CAPITAL LETTER A vs U+0391 GREEK CAPITAL LETTER ALPHA.
    GOOD_CHARACTERS = ''.join([
        '\t\n\r -~', # ASCII (tabs and line endings are checked separately)
        '\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
        '\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
        '\u2190-\u21FF', # Arrows
        '\u2200-\u22FF', # Mathematical Symbols
        '\u2500-\u257F' # Box Drawings characters used in markdown trees
    ])
    # Allow any of the characters and ranges above, and anything classified
    # as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))

    def issue_with_line(self, line, _filepath, line_number):
        """Whether the line is not valid UTF-8 or contains a character
        that is not explicitly allowed."""
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        if line_number == 1 and text.startswith('\uFEFF'):
            # Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
            # Which files are allowed to have a BOM is handled in
            # Utf8BomIssueTracker.
            text = text[1:]
        return not self.GOOD_CHARACTERS_RE.match(text)

class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check all non-exempt files except Windows-specific files."""
        if not super().should_check_file(filepath):
            return False
        return not is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line contains a CR character."""
        return b"\r" in line


class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check only non-exempt Windows-specific files."""
        if not super().should_check_file(filepath):
            return False
        return is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line does not end with CRLF or has a stray CR
        before the final CRLF."""
        return not line.endswith(b"\r\n") or b"\r" in line[:-2]


class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"
    suffix_exemptions = frozenset([".dsp", ".md"])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether stripping all trailing whitespace removes more than
        just the line terminator."""
        return line.rstrip(b"\r\n") != line.rstrip()


class TabIssueTracker(LineIssueTracker):
    """Track lines with tabs."""

    heading = "Tabs present:"
    suffix_exemptions = frozenset([
        ".make",
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/.gitmodules",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    ])

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line contains a tab character."""
        return b"\t" in line


class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.
    These are leftovers from a ``git merge`` that wasn't fully edited."""

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Whether the line looks like a leftover conflict marker."""
        # Detect leftover git conflict markers.
        # The '|||||||' marker comes from merge.conflictStyle=diff3.
        if line.startswith((b'<<<<<<< ', b'>>>>>>> ', b'||||||| ')):
            return True
        # A line of equal signs is a legitimate heading underline in
        # Markdown, so don't flag it in .md files.
        if line.rstrip(b'\r\n') == b'=======' and \
           not _filepath.endswith('.md'):
            return True
        return False


def this_location():
    """Return the base name of this file and the current line number."""
    frame = inspect.currentframe()
    assert frame is not None
    info = inspect.getframeinfo(frame)
    return os.path.basename(info.filename), info.lineno
THIS_FILE_BASE_NAME, LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = this_location()

class LicenseIssueTracker(LineIssueTracker):
    """Check copyright statements and license indications.

    This class only checks that statements are correct if present. It does
    not enforce the presence of statements in each file.
    """

    heading = "License issue:"

    LICENSE_EXEMPTION_RE_LIST = [
        # Third-party code, other than whitelisted third-party modules,
        # may be under a different license.
        r'3rdparty/(?!(p256-m)/.*)',
        # Documentation explaining the license may have accidental
        # false positives.
        r'(ChangeLog|LICENSE|framework\/LICENSE|[-0-9A-Z_a-z]+\.md)\Z',
        # Files imported from TF-M, and not used except in test builds,
        # may be under a different license.
        r'configs/ext/crypto_config_profile_medium\.h\Z',
        r'configs/ext/tfm_mbedcrypto_config_profile_medium\.h\Z',
        r'configs/ext/README\.md\Z',
        # Third-party file.
        r'dco\.txt\Z',
        r'framework\/dco\.txt\Z',
    ]
    path_exemptions = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST +
                                          LICENSE_EXEMPTION_RE_LIST))

    COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
    # Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
    COPYRIGHT_RE = re.compile(rb'.*\bcopyright\s+((?:\w|\s|[()]|[^ -~])*\w)', re.I)

    SPDX_HEADER_KEY = b'SPDX-License-Identifier'
    LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
    SPDX_RE = re.compile(br'.*?(' +
                         re.escape(SPDX_HEADER_KEY) +
                         br')(:\s*(.*?)\W*\Z|.*)', re.I)

    LICENSE_MENTION_RE = re.compile(rb'.*(?:' + rb'|'.join([
        rb'Apache License',
        rb'General Public License',
    ]) + rb')', re.I)

    def __init__(self):
        super().__init__()
        # Record what problem was caused. We can't easily report it due to
        # the structure of the script. To be fixed after
        # https://github.com/Mbed-TLS/mbedtls/pull/2506
        self.problem = None

    def issue_with_line(self, line, filepath, line_number):
        """Whether the line contains an incorrect copyright statement,
        an incorrect SPDX header or a suspicious license mention.

        When returning true, ``self.problem`` is set to a description of
        what is wrong with the line.
        """
        #pylint: disable=too-many-return-statements

        # Use endswith() rather than the more correct os.path.basename()
        # because experimentally, it makes a significant difference to
        # the running time.
        if filepath.endswith(THIS_FILE_BASE_NAME) and \
           line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER:
            # Avoid false positives from the code in this class.
            # Also skip the rest of this file, which is highly unlikely to
            # contain any problematic statements since we put those near the
            # top of files.
            return False

        m = self.COPYRIGHT_RE.match(line)
        if m and m.group(1) != self.COPYRIGHT_HOLDER:
            self.problem = 'Invalid copyright line'
            return True

        m = self.SPDX_RE.match(line)
        if m:
            # The regex is case-insensitive, so a mismatch here means that
            # the key is present but with the wrong letter case.
            if m.group(1) != self.SPDX_HEADER_KEY:
                self.problem = 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
                return True
            if not m.group(3):
                self.problem = 'Improperly formatted SPDX license identifier'
                return True
            if m.group(3) != self.LICENSE_IDENTIFIER:
                self.problem = 'Wrong SPDX license identifier'
                return True

        m = self.LICENSE_MENTION_RE.match(line)
        if m:
            self.problem = 'Suspicious license mention'
            return True

        return False


class IntegrityChecker:
    """Sanity-check files under the current directory."""

    def __init__(self, log_file):
        """Instantiate the sanity checker.
        Check files under the current directory.
        Write a report of issues to log_file."""
        # NOTE(review): presumably this validates that the current
        # directory is the root of the source tree; see build_tree.
        build_tree.check_repo_path()
        self.logger = None
        self.setup_logger(log_file)
        self.issues_to_check = [
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),
            TabIssueTracker(),
            MergeArtifactIssueTracker(),
            LicenseIssueTracker(),
        ]

    def setup_logger(self, log_file, level=logging.INFO):
        """Log to log_file if provided, or to stderr if None."""
        self.logger = logging.getLogger()
        self.logger.setLevel(level)
        if log_file:
            handler = logging.FileHandler(log_file)
            self.logger.addHandler(handler)
        else:
            console = logging.StreamHandler()
            self.logger.addHandler(console)

    @staticmethod
    def collect_files():
        """Return the list of files to check.

        These are the regular files committed into Git, both in the
        top-level repository and in the ``framework`` directory, which
        is listed through a separate Git invocation.
        """
        bytes_output = subprocess.check_output(['git', '-C', 'framework',
                                                'ls-files', '-z'])
        # With -z, each name is followed by a null byte, so the last
        # element of the split is an empty string to discard.
        bytes_framework_filepaths = [b'framework/' + filepath
                                     for filepath in bytes_output.split(b'\0')[:-1]]

        bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
        bytes_filepaths = bytes_output.split(b'\0')[:-1] + \
                          bytes_framework_filepaths
        # Filter out directories. Normally Git doesn't list directories
        # (it only knows about the files inside them), but there is
        # at least one case where 'git ls-files' includes a directory:
        # submodules. Just skip submodules (and any other directories).
        ascii_filepaths = [fp
                           for fp in (bfp.decode('ascii')
                                      for bfp in bytes_filepaths)
                           if os.path.isfile(fp)]
        # Prepend './' to files in the top-level directory so that
        # something like `'/Makefile' in fp` matches in the top-level
        # directory as well as in subdirectories.
        return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
                for fp in ascii_filepaths]

    def check_files(self):
        """Check all files for all issues."""
        # Collecting the list of files runs Git and inspects every file,
        # so do it only once rather than once per issue tracker.
        filepaths = self.collect_files()
        for issue_to_check in self.issues_to_check:
            for filepath in filepaths:
                if issue_to_check.should_check_file(filepath):
                    issue_to_check.check_file_for_issue(filepath)

    def output_issues(self):
        """Log the issues found and their locations.

        Return 1 if there were issues, 0 otherwise.
        """
        integrity_return_code = 0
        for issue_to_check in self.issues_to_check:
            if issue_to_check.files_with_issues:
                integrity_return_code = 1
            issue_to_check.output_file_issues(self.logger)
        return integrity_return_code


def run_main():
    """Parse the command line, run all the checks, log the issues found
    and exit with status 1 if there were any issues, 0 otherwise."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-l", "--log_file", type=str, help="path to optional output log",
    )
    check_args = parser.parse_args()
    integrity_check = IntegrityChecker(check_args.log_file)
    integrity_check.check_files()
    return_code = integrity_check.output_issues()
    sys.exit(return_code)


if __name__ == "__main__":
    run_main()