#!/usr/bin/python3
"""Locate copyright notices in files and report holders and licenses.

This part of the script holds the text-analysis helpers: trimming a
comment down to its copyright notice, classifying license text, pulling
the notice out of C-style sources, and splitting a notice into its
holder part and its license part.
"""

import sys
import fileinput
import magic
import re
from comment_parser import comment_parser

# Phrases (matched case-insensitively) that mark the start of ordinary
# descriptive text following a notice; extraction stops at the first line
# containing any of them.
end_pats = (r'This implementation just',
            r'Synopsis of public',
            r'The SPU must have',
            r'This is a simple version',
            r'This is a dummy',
            r'stdio_ext\.h',
            r'python script to',
            r'libgen\.h',
            r'Id:.*Exp',
            r'sccs\.',
            r'Tests gleaned',
            r'tar\.h',
            r'tzcalc_limits\.c',
            r'dummy file',
            r'Rearranged for general',
            r'sincos',
            r'POSIX',
            r'Reentrant',
            r'Copied',
            r'These are',
            r'creat',
            r'ARM configuration',
            r'Place holder',
            r'local header',
            r'GNU variant',
            r'default reentrant',
            r'A replacement',
            r'The signgam',
            r'dummy header',
            r'Uniset',
            r'wcsftime\.c',
            r'month_lengths\.c',
            r'Static instance',
            r'Conversion is performed',
            r'Common routine',
            r'l64a')

# Same purpose as end_pats, but matched case-sensitively.
end_pats_s = (r'FUNCTION',)

end_res = ([re.compile(pat, re.I) for pat in end_pats] +
           [re.compile(pat) for pat in end_pats_s])

# Strips leading characters that are not ASCII letters or digits.
left_res = re.compile(r'^[^A-Za-z0-9]*(.*)')
# Matches one trailing space, '/', '*' or tab.  clean_copyright() removes
# the same character set with str.rstrip(); the pattern is kept so the
# module-level name stays available.
right_res = re.compile(r'(.*)[ /*\t]$')

# Patterns used by clean_copyright(), compiled once instead of per call.
_CPR_LINE = re.compile(r'copyright.*(20[0-2][0-9]|19[7-9][0-9])', re.I)
_UPPER = re.compile(r'[A-Z]')
_LOWER = re.compile(r'[a-z]')
_MODIFIED = re.compile(r'Modified')
_DERIVED = re.compile(r'code is derived from software', re.I)

# Patterns used by find_copyright_source().
_OPEN_GROUP_RE = re.compile(r'.*Open Group Base Specification', re.I | re.S)
_SOURCE_COPYRIGHT_RE = re.compile(r'.*copyright.*', re.I | re.S)


def clean_copyright(string):
    """Reduce comment text to its copyright notice and license text.

    Lines before the first one containing "copyright" followed by a year
    (1970-2029) are dropped, and that line is cut to start at the word
    "copyright".  Extraction then continues line by line and stops at the
    first line matching any pattern in ``end_res``, or, once a line with
    upper-case letters and no lower-case letters has been seen, at the
    first later line containing a lower-case letter.  A line containing
    "Modified" or "code is derived from software" starts a skipped
    stretch that lasts through the next line that is empty after
    trimming.  Kept lines have leading non-alphanumeric characters and
    trailing spaces, tabs, '/' and '*' removed.

    Args:
        string: the text of one comment, possibly spanning many lines.

    Returns:
        The kept lines joined with newlines and stripped; '' when no
        copyright line was found.
    """
    kept = []
    have_cpr = False
    skipping = False
    only_upper = False
    for line in string.splitlines():
        found = _CPR_LINE.search(line)
        if found:
            line = line[found.start():]
            # NOTE(review): the earlier code also ran
            # re.match(r'(.*<[^>]+>).*', line) here and sliced the line at
            # m.end().  That is the end of the whole match, i.e. the end
            # of the line, so the slice never changed anything and is
            # omitted.  If cutting the line after an <address> was
            # intended it would need m.end(1) -- confirm before adding.
            have_cpr = True

        if not have_cpr:
            continue

        if any(regex.search(line) for regex in end_res):
            break

        if _MODIFIED.search(line) or _DERIVED.search(line):
            skipping = True
            continue

        if only_upper:
            if _LOWER.search(line):
                break
        elif _UPPER.search(line) and not _LOWER.search(line):
            only_upper = True

        line = left_res.match(line).group(1).rstrip(' /*\t')
        if skipping:
            # The empty line that ends a skipped stretch is dropped too.
            if not line:
                skipping = False
            continue
        kept.append(line)
    return '\n'.join(kept).strip()


def get_license_type(copyright):
    """Classify license text by the phrases it contains.

    The checks run in a fixed order and the first hit wins, so for
    example text mentioning NetBSD is reported as "NetBSD" even if it
    also carries the BSD redistribution clauses.

    Args:
        copyright: license text to classify.

    Returns:
        One of "Default", "NetBSD", "BSD3", "BSD2", "UCB", "FreeBSD",
        "AGPL3", "AGPL", "GPL2", "GPL3", "GPL" or "Other".
    """
    if re.match(r'^no support for.*', copyright):
        return "Default"
    if copyright in ('none', ''):
        return "Default"
    if re.search(r'NetBSD', copyright):
        return "NetBSD"
    if (re.search(r'Redistributions *(of)? *source +code', copyright) and
            re.search(r'Redistributions +in +binary +form', copyright)):
        # A "the name(s) of" clause distinguishes the 3-clause form.
        if re.search(r'[tT]he +names? of', copyright):
            return "BSD3"
        return "BSD2"
    if (re.search(r'University', copyright) and
            re.search(r'California', copyright) and
            re.search(r'Berkeley', copyright)):
        return "UCB"
    if re.search(r'FreeBSD', copyright):
        return "FreeBSD"
    if (re.search(r'AGPL', copyright) or
            re.search(r'Affero General Public License', copyright)):
        if re.search(r'version 3', copyright):
            return "AGPL3"
        return "AGPL"
    if re.search(r'the GPL', copyright):
        if re.search(r'version 2', copyright):
            return "GPL2"
        if re.search(r'version 3', copyright):
            return "GPL3"
        return "GPL"
    return "Other"


def find_copyright_source(name, mime):
    """Find the copyright notice in a file parsed by comment_parser.

    The first comment mentioning "copyright" (and not starting with text
    that mentions the "Open Group Base Specification") is used.  Comments
    that follow it on directly consecutive line numbers are appended to
    it before the text is handed to clean_copyright().

    Args:
        name: path of the file to read; decoded as UTF-8 with undecodable
            bytes ignored.
        mime: MIME type passed to comment_parser to select the parser.

    Returns:
        The cleaned notice, or None when comment extraction fails or no
        comment mentions a copyright.
    """
    with open(name, 'rb') as f:
        text = f.read().decode('utf-8', errors='ignore')
    try:
        comments = comment_parser.extract_comments_from_str(text, mime)
    except Exception:
        # Best effort: a file the parser cannot handle simply has no
        # notice.  KeyboardInterrupt and SystemExit still propagate.
        return None
    for index, comment in enumerate(comments):
        body = comment.text()
        if _OPEN_GROUP_RE.match(body):
            continue
        if not _SOURCE_COPYRIGHT_RE.match(body):
            continue
        bits = [body]
        expected = comment.line_number() + 1
        for follower in comments[index + 1:]:
            if follower.line_number() != expected:
                break
            bits.append(follower.text())
            expected += 1
        return clean_copyright('\n'.join(bits))
    return None


# Copyright holder is generally the first paragraph of the text

def pick_split(split, match, use_end):
    """Keep whichever split candidate starts earliest.

    Args:
        split: the current best ``(start, end)`` tuple, or None.
        match: a regex match proposing a new candidate, or None.
        use_end: when true the candidate's end is ``match.end()``;
            otherwise it is ``match.start()``, so the matched text stays
            on the far side of the split.

    Returns:
        The candidate from ``match`` if there was no ``split`` yet or the
        match starts strictly earlier; otherwise ``split`` unchanged.
    """
    if not match:
        return split
    start = match.start()
    end = match.end() if use_end else start
    if not split or start < split[0]:
        return (start, end)
    return split


def _match_near(pattern, text, pos, flags):
    """Return the first match of pattern starting at or after pos.

    When every match starts before pos the last match is returned, and
    None when the pattern does not match at all.
    """
    found = None
    for found in re.finditer(pattern, text, flags):
        if found.start() >= pos:
            break
    return found


def get_split(copyright):
    """Locate the boundary between the holder lines and the license text.

    Candidates are "all rights reserved", a blank line (considered only
    when "all rights reserved" was not found), and the first occurrences
    of "This ", "Portions", "modification", "permission to" and "public
    domain".  The earliest-starting candidate wins.  When the text has a
    "copyright <year> ... <letter>" line, the first two candidates are
    searched for from the end of the last such line.

    Args:
        copyright: cleaned notice text.

    Returns:
        A ``(holder_end, license_start)`` tuple of string offsets, or
        None when no candidate matched.
    """
    lastcopy = None
    for lastcopy in re.finditer(
            r'copyright (\(c\))? *(19[7-9][0-9]|20[0-2][0-9])[^A-Z]*[A-Z][^\n]*',
            copyright, re.I | re.S):
        pass
    if lastcopy:
        para = _match_near(r'\n\n', copyright, lastcopy.end(), re.S)
        rights = _match_near(r'all rights reserved\.?', copyright,
                             lastcopy.end(), re.I | re.S)
    else:
        para = re.search(r'\n\n', copyright, re.S)
        rights = re.search(r'all rights reserved\.?', copyright,
                           re.I | re.S)

    split = pick_split(None, rights, True)
    if not split:
        split = pick_split(split, para, True)
    for pattern in (r'This ', r'Portions', r'modification',
                    r'permission to', r'public domain'):
        split = pick_split(split,
                           re.search(pattern, copyright, re.I | re.S),
                           False)
    return split


def get_holder(copyright):
    """Return the holder part of a notice.

    This is the text before the split found by get_split(), or the whole
    text when no split point exists.
    """
    split = get_split(copyright)
    if split:
        return copyright[:split[0]]
    return copyright

def get_license(copyright):
    """Return the license part of a notice.

    The text after the split found by get_split() is returned, with a
    leading blank-line run removed.  Without a split point the result is
    '' except for the placeholder "unknown file type", which is passed
    through unchanged.
    """
    split = get_split(copyright)
    if split:
        copyright = copyright[split[1]:]
        m = re.match(r'^[ \t]*\n[ \t]*\n*', copyright, re.I | re.S)
        if m:
            copyright = copyright[m.end():]
        return copyright
    if copyright == "unknown file type":
        return copyright
    return ""


# Line patterns for the three line-comment syntaxes handled below; group 1
# is the comment text after the marker and any following blanks.
_POUND_LINE = re.compile(r'^#[ \t]*(.*)$')
_SEMI_LINE = re.compile(r'^;;*[ \t]*(.*)$')
_TROFF_LINE = re.compile(r'^\.\\"[ \t]*(.*)$')

_COPYRIGHT_RE = re.compile(r'.*copyright.*', re.I | re.S)

# Names matching any of these (via re.match) are data files that don't
# need a license.
_SKIP_NAME_PATTERNS = (
    r'.*\.t$',
    r'.*\.cct$',
    r'.*ChangeLog.*',
    r'.*COPYING.*',
    r'.*NEWS.*',
    r'.*MAINTAINERS',
    r'CODE_OF_CONDUCT.*',
    r'CONTRIBUTING.*',
)

# Applied in order to each holder line; each removes the last occurrence
# of the matched word and joins what surrounds it with one space.
_HOLDER_NOISE_PATTERNS = (
    r'(.*)copyright(.*)',
    r'(.*)©(.*)',
    r'(.*)\(c\)(.*)',
    r'(.*) by (.*)',
    r'(.*)all rights reserved\.?(.*)',
)

# Holder lines are printed only when they start with a year.
# NOTE(review): this accepts 198x, 199x and 2000-2299, while the patterns
# that detect copyright lines elsewhere in the file accept 197x as well,
# so a 197x holder line is found but not printed -- confirm whether that
# is intended.
_YEAR_LINE = re.compile(r'(19[89][0-9]|2[0-2][0-9][0-9]).*')


def _read_text(name):
    """Read a file as UTF-8 text, ignoring undecodable bytes."""
    with open(name, 'rb') as f:
        return f.read().decode('utf-8', errors='ignore')


def _prefixed_comments(name, line_re):
    """Collect runs of consecutive lines matching line_re.

    Each run becomes one string made of the group-1 text of its lines
    joined with newlines.  The file is decoded as UTF-8 with undecodable
    bytes ignored, like the other readers in this script, and is closed
    before returning.
    """
    comments = []
    current = []
    with open(name, encoding='utf-8', errors='ignore') as f:
        for line in f:
            m = line_re.match(line)
            if m:
                current.append(m.group(1))
            elif current:
                comments.append('\n'.join(current))
                current = []
    if current:
        comments.append('\n'.join(current))
    return comments


def _first_copyright(comments):
    """Clean and return the first comment mentioning "copyright"."""
    for comment in comments:
        if _COPYRIGHT_RE.match(comment):
            return clean_copyright(comment)
    return None


def pound_comments(name):
    """Return the blocks of consecutive '#' comment lines in a file."""
    return _prefixed_comments(name, _POUND_LINE)


def find_copyright_pound(name):
    """Return the cleaned notice from a file's '#' comments, or None."""
    return _first_copyright(pound_comments(name))


def semi_comments(name):
    """Return the blocks of consecutive ';' comment lines in a file."""
    return _prefixed_comments(name, _SEMI_LINE)


def find_copyright_semi(name):
    """Return the cleaned notice from a file's ';' comments, or None."""
    return _first_copyright(semi_comments(name))


def troff_comments(name):
    """Return the blocks of consecutive '.\\"' comment lines in a file."""
    return _prefixed_comments(name, _TROFF_LINE)


def find_copyright_troff(name):
    """Return the cleaned notice from a file's '.\\"' comments, or None."""
    return _first_copyright(troff_comments(name))


def clean(str, chars):
    """Return str lower-cased with every character in chars removed."""
    return ''.join(c.lower() for c in str if c not in chars)


def pack_copyright(copyright):
    """Return a comparison key for notice text.

    Blanks, tabs, newlines and the characters ``.,!*`` are removed and
    the rest is lower-cased, so notices differing only in layout or
    punctuation share a key.
    """
    return clean(copyright, " .,!*\n\t")


def num_lines(name):
    """Return the number of lines in a file."""
    return len(_read_text(name).splitlines())


def starts_with(pattern, name):
    """Return True when the file's text matches pattern at offset 0."""
    return re.match(pattern, _read_text(name)) is not None


def file_contains(pattern, name):
    """Search the file's text for pattern; return the match or None."""
    return re.search(pattern, _read_text(name))


def _collect_names(args):
    """Expand the argument list into file names.

    The argument '-' is replaced by the stripped lines read from
    standard input.
    """
    names = []
    for arg in args:
        if arg == '-':
            names.extend(line.strip() for line in sys.stdin)
        else:
            names.append(arg)
    return names


def _lookup_copyright(name):
    """Return the notice text for one file.

    Returns None when the file is to be left out of the report, and ''
    when the file is reported without any notice.
    """
    if any(re.match(pat, name) for pat in _SKIP_NAME_PATTERNS):
        return None

    copyright = None
    if (re.match(r'.*\.[ch]$', name) or re.match(r'.*\.ld$', name) or
            re.match(r'.*\.[ch]\.in$', name)):
        copyright = find_copyright_source(name, 'text/x-c')
    elif re.match(r'.*\.[sS]$', name):
        copyright = find_copyright_semi(name)
        if not copyright:
            copyright = find_copyright_source(name, 'text/x-c')
    elif re.match(r'.*meson.*', name) or re.match(r'.*Makefile.*', name):
        copyright = find_copyright_pound(name)
    elif re.match(r'.*\.[123]$', name):
        copyright = find_copyright_troff(name)

    if not copyright:
        # Nothing found by file name; choose a parser from the file type
        # description reported by magic instead.
        m = magic.from_file(name)
        if m is None:
            copyright = 'unknown file type'
        elif re.search(r'troff', m):
            copyright = find_copyright_troff(name)
        elif re.search(r'C source', m):
            copyright = find_copyright_source(name, 'text/x-c')
        elif (re.search(r'POSIX shell script', m) or
              re.search(r'Bourne-Again shell script', m) or
              re.search(r'Python script', m) or
              re.search(r'Perl script', m) or
              starts_with(r'#', name)):
            copyright = find_copyright_pound(name)

    if not copyright:
        # Skip very short files without a copyright
        if num_lines(name) < 10:
            return None

        # Skip generated files
        if (file_contains(r'generated automatically', name) or
                file_contains(r'automatically generated', name)):
            return None

        copyright = ''
    return copyright


def _holder_lines(holder):
    """Return the holder lines to print.

    Each line has the words in _HOLDER_NOISE_PATTERNS removed and is
    stripped; only lines that then start with a year are kept.
    """
    lines = []
    for line in holder.splitlines():
        for pat in _HOLDER_NOISE_PATTERNS:
            m = re.match(pat, line, re.I)
            if m:
                line = m.group(1).strip() + ' ' + m.group(2).strip()
        line = line.strip()
        if _YEAR_LINE.match(line):
            lines.append(line)
    return lines


def main():
    """Report the copyright holders and licenses of the named files.

    File names come from the command line; the argument '-' reads more
    names from standard input.  Files are grouped by notice text (compared
    through pack_copyright), and each distinct license text is given a
    name of the form "<type>-<n>".  The output is one
    Files/Copyright/License stanza per distinct notice, followed by one
    License stanza per named license containing its text, with empty
    lines written as '.'.
    """
    copyrights = {}
    copyright_users = {}
    for name in _collect_names(sys.argv[1:]):
        copyright = _lookup_copyright(name)
        if copyright is None:
            continue
        key = pack_copyright(copyright)
        if key not in copyrights:
            copyrights[key] = copyright
            copyright_users[key] = [name]
        else:
            copyright_users[key].append(name)

    licenses = {}        # license name -> license text
    license_ids = {}     # license type -> next unused number
    license_name = {}    # packed license text -> license name
    copyright_map = {}   # license name -> notice keys using it

    for key, copyright in copyrights.items():
        license_text = get_license(copyright)
        license_key = pack_copyright(license_text)
        if license_key in license_name:
            name = license_name[license_key]
        else:
            kind = get_license_type(license_text)
            number = license_ids.get(kind, 1)
            license_ids[kind] = number + 1
            name = "%s-%d" % (kind, number)
            licenses[name] = license_text
            license_name[license_key] = name
        copyright_map.setdefault(name, []).append(key)

    for kind in sorted(license_ids):
        for number in range(1, license_ids[kind]):
            name = "%s-%d" % (kind, number)
            for key in copyright_map[name]:
                print("Files:", end='')
                for user in copyright_users[key]:
                    print(" %s" % user)
                print("Copyright:", end='')
                holder = get_holder(copyrights[key])
                print('\n'.join(' %s' % line
                                for line in _holder_lines(holder)))
                print("License: %s" % name)
                print()

    started = False
    for kind in sorted(license_ids):
        for number in range(1, license_ids[kind]):
            if started:
                print()
            started = True
            name = "%s-%d" % (kind, number)
            print("License: %s" % name)
            for line in licenses[name].splitlines():
                if not line:
                    line = '.'
                print(" %s" % line)


if __name__ == "__main__":
    main()