1#!/usr/bin/python3
2
3import sys
4import fileinput
5import magic
6import re
7from comment_parser import comment_parser
8
# Case-insensitive patterns.  clean_copyright() stops collecting text
# at the first line (after the copyright notice) that matches any of them.
end_pats = (
        r'This implementation just',
        r'Synopsis of public',
        r'The SPU must have',
        r'This is a simple version',
        r'This is a dummy',
        r'stdio_ext\.h',
        r'python script to',
        r'libgen\.h',
        r'Id:.*Exp',
        r'sccs\.',
        r'Tests gleaned',
        r'tar\.h',
        r'tzcalc_limits\.c',
        r'dummy file',
        r'Rearranged for general',
        r'sincos',
        r'POSIX',
        r'Reentrant',
        r'Copied',
        r'These are',
        r'creat',
        r'ARM configuration',
        r'Place holder',
        r'local header',
        r'GNU variant',
        r'default reentrant',
        r'A replacement',
        r'The signgam',
        r'dummy header',
        r'Uniset',
        r'wcsftime\.c',
        r'month_lengths\.c',
        r'Static instance',
        r'Conversion is performed',
        r'Common routine',
        r'l64a',
)

# Same purpose as end_pats, but matched case-sensitively.
end_pats_s = (
        r'FUNCTION',
)

# Compiled terminators: the case-insensitive ones first, then the
# case-sensitive ones, in the order they are listed above.
end_res = ([re.compile(pattern, re.I) for pattern in end_pats] +
           [re.compile(pattern) for pattern in end_pats_s])

# Group 1 is the line with any leading run of non-alphanumerics removed.
left_res = re.compile(r'^[^A-Za-z0-9]*(.*)')
# Matches when the line ends in a space, '/', '*' or tab; group 1 is the
# line without that single trailing character.
right_res = re.compile(r'(.*)[ /*\t]$')
59
def clean_copyright(string):
        """Extract the copyright notice and license text from a comment.

        Lines before the first one containing "copyright" followed by a
        year (1970-2029) are dropped.  From that line on, each line has its
        leading non-alphanumeric decoration and its trailing spaces, tabs,
        '/' and '*' removed, and collection stops at the first line that
        matches one of the module-level ``end_res`` patterns.

        Args:
                string: the full text of one comment.

        Returns:
                The cleaned lines joined with newlines and stripped; an empty
                string when no copyright line was found.
        """
        cpr_line = re.compile(r'copyright.*(20[0-2][0-9]|19[7-9][0-9])', re.I)
        through_email = re.compile(r'(.*<[^>]+>).*')
        upper = re.compile(r'[A-Z]')
        lower = re.compile(r'[a-z]')
        modified = re.compile(r'Modified')
        derived = re.compile(r'code is derived from software', re.I)

        copyright = []
        have_cpr = False
        skipping = False
        only_upper = False
        for line in string.splitlines():
                m = cpr_line.search(line)
                if m:
                        # Drop whatever precedes the word "copyright".
                        line = line[m.start():]
                        m = through_email.match(line)
                        if m:
                                # Cut the line after the last <...> address.
                                # This must use the end of group 1: the end
                                # of the whole match is the end of the line
                                # because of the trailing '.*', which made
                                # the truncation a no-op.
                                line = line[:m.end(1)]
                        have_cpr = True

                if not have_cpr:
                        continue

                if any(regex.search(line) for regex in end_res):
                        break

                # "Modified ..." and "... code is derived from software ..."
                # start a passage that is skipped up to the next line that
                # is empty after cleaning.
                if modified.search(line) or derived.search(line):
                        skipping = True
                        continue

                # After a line written entirely in upper case has been seen,
                # the first line containing a lower-case letter ends the text.
                if only_upper:
                        if lower.search(line):
                                break
                elif upper.search(line) and not lower.search(line):
                        only_upper = True

                line = left_res.match(line).group(1)
                # Same result as repeatedly applying right_res, which peels
                # one trailing ' ', '/', '*' or tab per match.
                line = line.rstrip(' /*\t')
                if skipping:
                        if len(line) == 0:
                                skipping = False
                        continue
                copyright.append(line)
        return '\n'.join(copyright).strip()
115
def get_license_type(copyright):
        """Classify license text into a short type name.

        The checks run in a fixed order and the first one that applies wins:
        "Default" (empty text, 'none', or text starting with
        'no support for'), "NetBSD", "BSD3"/"BSD2", "UCB", "FreeBSD",
        "AGPL3"/"AGPL", "GPL2"/"GPL3"/"GPL", and finally "Other".

        Args:
                copyright: the license text to classify.

        Returns:
                One of the type names listed above.
        """
        def has(pattern):
                return re.search(pattern, copyright) is not None

        if re.match(r'^no support for.*', copyright):
                return "Default"
        if copyright in ('none', ''):
                return "Default"

        if has(r'NetBSD'):
                return "NetBSD"

        redistribution = (has(r'Redistributions *(of)? *source +code') and
                          has(r'Redistributions +in +binary +form'))
        if redistribution:
                # A clause about "the name(s) of ..." selects the
                # three-clause variant.
                return "BSD3" if has(r'[tT]he +names? of') else "BSD2"

        if has(r'University') and has(r'California') and has(r'Berkeley'):
                return "UCB"

        if has(r'FreeBSD'):
                return "FreeBSD"

        if has(r'AGPL') or has(r'Affero General Public License'):
                return "AGPL3" if has(r'version 3') else "AGPL"

        if has(r'the GPL'):
                if has(r'version 2'):
                        return "GPL2"
                if has(r'version 3'):
                        return "GPL3"
                return "GPL"

        return "Other"
147
def find_copyright_source(name, mime):
        """Find the copyright text in a file parsed by comment_parser.

        The first comment that mentions "copyright" (and is not a quotation
        of the Open Group Base Specification) is taken, together with the
        comments that follow it on directly consecutive line numbers, and
        handed to clean_copyright().

        Args:
                name: path of the file to read.
                mime: MIME type passed to comment_parser to pick the syntax.

        Returns:
                The cleaned copyright text, or None when the file cannot be
                parsed or contains no suitable comment.
        """
        with open(name, 'rb') as f:
                text = f.read().decode('utf-8', errors='ignore')
        try:
                comments = comment_parser.extract_comments_from_str(text, mime)
        except Exception:
                # Best effort: a file the parser rejects simply has no
                # copyright found here.  Unlike a bare "except:", this lets
                # KeyboardInterrupt and SystemExit propagate.
                return None

        tog = re.compile(r'.*Open Group Base Specification', re.I | re.S)
        has_copyright = re.compile(r'.*copyright.*', re.I | re.S)
        for index, comment in enumerate(comments):
                body = comment.text()
                if tog.match(body):
                        continue
                if not has_copyright.match(body):
                        continue

                # Glue on the comments that start on the immediately
                # following lines, one per line.
                bits = body
                expected_line = comment.line_number() + 1
                for following in comments[index + 1:]:
                        if following.line_number() != expected_line:
                                break
                        bits += '\n' + following.text()
                        expected_line += 1
                return clean_copyright(bits)
        return None
173
174# Copyright holder is generally the first paragraph of the text
175
def pick_split(split,match,use_end):
        """Return whichever of `split` and `match` starts earlier.

        Args:
                split: the current (start, end) pair, or None.
                match: a regex match object, or None.
                use_end: when true the candidate ends at the end of the
                        match, otherwise at its start (zero width).

        Returns:
                `split` unchanged when there is no match or the match does
                not start strictly before it; otherwise a new (start, end)
                pair taken from the match.
        """
        if not match:
                return split
        begin = match.start()
        if split and begin >= split[0]:
                return split
        return (begin, match.end() if use_end else begin)
188
def get_split(copyright):
        """Locate the boundary between the holder lines and the license text.

        Args:
                copyright: cleaned copyright text.

        Returns:
                A (start, end) pair, where the holder part is the text before
                `start` and the license part the text from `end` on, or None
                when no boundary marker is found.
        """
        def first_from(pattern, flags, limit):
                # The first match starting at or after `limit`.  When there
                # is none, the last match seen is returned instead (None if
                # the pattern never matches).
                found = None
                for found in re.finditer(pattern, copyright, flags):
                        if found.start() >= limit:
                                break
                return found

        # Remember the last "copyright <year> ... <letter> ..." line.
        last_notice = None
        for last_notice in re.finditer(r'copyright (\(c\))? *(19[7-9][0-9]|20[0-2][0-9])[^A-Z]*[A-Z][^\n]*', copyright, re.I|re.S):
                pass

        if last_notice:
                notice_end = last_notice.end()
                para = first_from(r'\n\n', re.S, notice_end)
                reserved = first_from(r'all rights reserved\.?', re.I | re.S, notice_end)
        else:
                para = re.search(r'\n\n', copyright, re.S)
                reserved = re.search(r'all rights reserved\.?', copyright, re.I | re.S)

        # "All rights reserved" is preferred; a blank line is only the
        # fallback.  Both are consumed by the split (use_end=True).
        split = pick_split(None, reserved, True)
        if not split:
                split = pick_split(split, para, True)

        # Any of these phrases occurring earlier moves the split up to
        # where the phrase starts.
        for phrase in (r'This ', r'Portions', r'modification',
                       r'permission to', r'public domain'):
                found = re.search(phrase, copyright, re.I | re.S)
                split = pick_split(split, found, False)
        return split
221
def get_holder(copyright):
        """Return the holder part of `copyright`.

        That is the text before the split found by get_split(), or the whole
        text when there is no split.
        """
        split = get_split(copyright)
        return copyright[:split[0]] if split else copyright
227
def get_license(copyright):
        """Return the license part of `copyright`.

        That is the text after the split found by get_split(), with the
        blank line(s) directly after the split removed.  Without a split the
        result is empty, except that the placeholder "unknown file type" is
        passed through unchanged.
        """
        split = get_split(copyright)
        if not split:
                return copyright if copyright == "unknown file type" else ""

        rest = copyright[split[1]:]
        lead = re.match(r'^[ \t]*\n[ \t]*\n*', rest, re.I|re.S)
        return rest[lead.end():] if lead else rest
239
def pound_comments(name):
        """Collect the blocks of '#' comment lines in a file.

        A block is a run of consecutive lines that start with '#'.  The '#'
        and the spaces or tabs after it are removed from every line.

        The file is decoded as UTF-8 with undecodable bytes dropped, the
        same way the other readers in this script treat their input, and
        the file handle is closed before returning.

        Args:
                name: path of the file to read.

        Returns:
                A list with one newline-joined string per comment block, in
                file order.
        """
        l = re.compile(r'^#[ \t]*(.*)$')
        comments = []
        comment = []
        with open(name, encoding='utf-8', errors='ignore') as f:
                for line in f:
                        m = l.match(line)
                        if m:
                                comment.append(m.group(1))
                        elif comment:
                                # A non-comment line ends the current block.
                                comments.append('\n'.join(comment))
                                comment = []
        if comment:
                comments.append('\n'.join(comment))
        return comments
255
def find_copyright_pound(name):
        """Return the cleaned copyright text of a '#'-commented file.

        The first '#' comment block that mentions "copyright" (in any case)
        is cleaned with clean_copyright().  Returns None when no block
        mentions it.
        """
        mentions_copyright = re.compile(r'.*copyright.*', re.I | re.S)
        for block in pound_comments(name):
                if mentions_copyright.match(block):
                        return clean_copyright(block)
        return None
263
def semi_comments(name):
        """Collect the blocks of ';' comment lines in a file.

        A block is a run of consecutive lines that start with one or more
        ';'.  The semicolons and the spaces or tabs after them are removed
        from every line.

        The file is decoded as UTF-8 with undecodable bytes dropped, the
        same way the other readers in this script treat their input, and
        the file handle is closed before returning.

        Args:
                name: path of the file to read.

        Returns:
                A list with one newline-joined string per comment block, in
                file order.
        """
        l = re.compile(r'^;;*[ \t]*(.*)$')
        comments = []
        comment = []
        with open(name, encoding='utf-8', errors='ignore') as f:
                for line in f:
                        m = l.match(line)
                        if m:
                                comment.append(m.group(1))
                        elif comment:
                                # A non-comment line ends the current block.
                                comments.append('\n'.join(comment))
                                comment = []
        if comment:
                comments.append('\n'.join(comment))
        return comments
279
def find_copyright_semi(name):
        """Return the cleaned copyright text of a ';'-commented file.

        The first ';' comment block that mentions "copyright" (in any case)
        is cleaned with clean_copyright().  Returns None when no block
        mentions it.
        """
        mentions_copyright = re.compile(r'.*copyright.*', re.I | re.S)
        for block in semi_comments(name):
                if mentions_copyright.match(block):
                        return clean_copyright(block)
        return None
287
def troff_comments(name):
        """Collect the blocks of troff comment lines in a file.

        A block is a run of consecutive lines that start with the three
        characters dot, backslash, double quote.  That prefix and the spaces
        or tabs after it are removed from every line.

        The file is decoded as UTF-8 with undecodable bytes dropped, the
        same way the other readers in this script treat their input, and
        the file handle is closed before returning.

        Args:
                name: path of the file to read.

        Returns:
                A list with one newline-joined string per comment block, in
                file order.
        """
        l = re.compile(r'^\.\\"[ \t]*(.*)$')
        comments = []
        comment = []
        with open(name, encoding='utf-8', errors='ignore') as f:
                for line in f:
                        m = l.match(line)
                        if m:
                                comment.append(m.group(1))
                        elif comment:
                                # A non-comment line ends the current block.
                                comments.append('\n'.join(comment))
                                comment = []
        if comment:
                comments.append('\n'.join(comment))
        return comments
303
def find_copyright_troff(name):
        """Return the cleaned copyright text of a troff file.

        The first troff comment block that mentions "copyright" (in any
        case) is cleaned with clean_copyright().  Returns None when no block
        mentions it.
        """
        mentions_copyright = re.compile(r'.*copyright.*', re.I | re.S)
        for block in troff_comments(name):
                if mentions_copyright.match(block):
                        return clean_copyright(block)
        return None
311
def clean(str,chars):
        """Return `str` lower-cased with every character in `chars` removed.

        The membership test is made on the original character, before it is
        lower-cased.
        """
        return "".join(c.lower() for c in str if c not in chars)
318
def pack_copyright(copyright):
        """Reduce copyright text to a comparison key.

        The text is lower-cased, and spaces, tabs, newlines and the
        characters . , ! * are removed.
        """
        ignored = " .,!*\n\t"
        return clean(copyright, ignored)
321
def num_lines(name):
        """Return the number of lines in the file `name`.

        The content is decoded as UTF-8, ignoring undecodable bytes, and
        split with str.splitlines().
        """
        with open(name, 'rb') as f:
                raw = f.read()
        lines = raw.decode('utf-8', errors='ignore').splitlines()
        return len(lines)
326
def starts_with(pattern, name):
        """Tell whether the first match of `pattern` is at the start of a file.

        The content is decoded as UTF-8, ignoring undecodable bytes.

        Returns:
                True when the first match of `pattern` starts at offset 0,
                False when it starts later, and None when the pattern does
                not match at all.
        """
        with open(name, 'rb') as f:
                raw = f.read()
        found = re.search(pattern, raw.decode('utf-8', errors='ignore'))
        if not found:
                return found
        return found.start() == 0
332
def file_contains(pattern, name):
        """Search the file `name` for `pattern`.

        The content is decoded as UTF-8, ignoring undecodable bytes.

        Returns:
                The match object of the first match, or None.
        """
        with open(name, 'rb') as f:
                raw = f.read()
        return re.search(pattern, raw.decode('utf-8', errors='ignore'))
337
def _input_names(args):
        """Expand the command-line arguments into a list of file names.

        The argument '-' is replaced by the names read from standard input,
        one per line.
        """
        names = []
        for arg in args:
                if arg == '-':
                        names.extend(line.strip() for line in sys.stdin)
                else:
                        names.append(arg)
        return names

def _needs_no_license(name):
        """Tell whether `name` is a data or project file that is not scanned."""
        exempt = (r'.*\.t$',
                  r'.*\.cct$',
                  r'.*ChangeLog.*',
                  r'.*COPYING.*',
                  r'.*NEWS.*',
                  r'.*MAINTAINERS',
                  r'CODE_OF_CONDUCT.*',
                  r'CONTRIBUTING.*')
        return any(re.match(pat, name) for pat in exempt)

def _extract_copyright(name):
        """Return the copyright text of the file `name`.

        The comment syntax is chosen from the file name first.  When that
        yields nothing, the file type reported by libmagic decides.  The
        result is falsy (None or '') when nothing was found.
        """
        copyright = None
        if (re.match(r'.*\.[ch]$', name) or re.match(r'.*\.ld$', name) or
            re.match(r'.*\.[ch]\.in$', name)):
                copyright = find_copyright_source(name, 'text/x-c')
        elif re.match(r'.*\.[sS]$', name):
                # Assembly: try ';' comments first, then C-style comments.
                copyright = find_copyright_semi(name)
                if not copyright:
                        copyright = find_copyright_source(name, 'text/x-c')
        elif re.match(r'.*meson.*', name) or re.match(r'.*Makefile.*', name):
                copyright = find_copyright_pound(name)
        elif re.match(r'.*\.[123]$', name):
                copyright = find_copyright_troff(name)
        if copyright:
                return copyright

        kind = magic.from_file(name)
        if kind is None:
                return 'unknown file type'
        if re.search(r'troff', kind):
                return find_copyright_troff(name)
        if re.search(r'C source', kind):
                return find_copyright_source(name, 'text/x-c')
        if (re.search(r'POSIX shell script', kind) or
            re.search(r'Bourne-Again shell script', kind) or
            re.search(r'Python script', kind) or
            re.search(r'Perl script', kind) or
            starts_with(r'#', name)):
                return find_copyright_pound(name)
        return copyright

def _print_files_stanza(files, holder, license_name):
        """Print one Files:/Copyright:/License: stanza."""
        print("Files:", end='')
        for path in files:
                print(" %s" % path)
        print("Copyright:", end='')
        done = False
        for line in holder.splitlines():
                # Remove the boilerplate words so that only years and
                # names remain.
                for pat in (r'(.*)copyright(.*)',
                            r'(.*)©(.*)', r'(.*)\(c\)(.*)',
                            r'(.*) by (.*)',
                            r'(.*)all rights reserved\.?(.*)'):
                        m = re.match(pat, line, re.I)
                        if m:
                                line = m.group(1).strip() + ' ' + m.group(2).strip()
                line = line.strip()
                # A line starting with a year begins a new holder; anything
                # else continues the previous output line.  Years from 1970
                # on are recognized, matching the copyright-line patterns
                # used elsewhere in this script.
                if done and re.match(r'(19[7-9][0-9]|2[0-2][0-9][0-9]).*', line):
                        print()
                print(' %s' % line, end='')
                done = True
        print()
        print("License: %s" % license_name)
        print()

def main():
        """Print copyright stanzas for the files named on the command line.

        Files with the same copyright text (ignoring case, whitespace and
        some punctuation) share one Files: stanza.  Each distinct license
        text gets a name of the form "<type>-<n>" and is printed once, after
        all the Files: stanzas.
        """
        copyrights = {}         # packed copyright -> copyright text
        copyright_users = {}    # packed copyright -> names of files using it
        for name in _input_names(sys.argv[1:]):

                # Data files don't need a license

                if _needs_no_license(name):
                        continue

                copyright = _extract_copyright(name)
                if not copyright:

                        # Skip very short files without a copyright

                        if num_lines(name) < 10:
                                continue

                        # Skip generated files

                        if (file_contains(r'generated automatically', name) or
                            file_contains(r'automatically generated', name)):
                                continue

                        copyright = ''

                key = pack_copyright(copyright)
                if key not in copyrights:
                        copyrights[key] = copyright
                        copyright_users[key] = [name]
                else:
                        copyright_users[key].append(name)

        licenses = {}           # license name -> license text
        license_ids = {}        # license type -> next unused number
        license_name = {}       # packed license text -> license name
        copyright_map = {}      # license name -> packed copyrights using it

        for key, text in copyrights.items():
                license_text = get_license(text)
                license_key = pack_copyright(license_text)
                if license_key in license_name:
                        name = license_name[license_key]
                else:
                        kind = get_license_type(license_text)
                        number = license_ids.get(kind, 1)
                        license_ids[kind] = number + 1
                        name = "%s-%d" % (kind, number)
                        licenses[name] = license_text
                        license_name[license_key] = name
                copyright_map.setdefault(name, []).append(key)

        # License names ordered by type, then by number.
        ordered_names = ["%s-%d" % (kind, number)
                         for kind in sorted(license_ids)
                         for number in range(1, license_ids[kind])]

        for name in ordered_names:
                for key in copyright_map[name]:
                        _print_files_stanza(copyright_users[key],
                                            get_holder(copyrights[key]),
                                            name)

        for index, name in enumerate(ordered_names):
                if index:
                        print()
                print("License: %s" % name)
                for line in licenses[name].splitlines():
                        # Empty lines are written as a single '.'.
                        print(" %s" % (line if line else '.'))
# Run only when executed as a script, so that importing this module
# does not start scanning files.
if __name__ == '__main__':
        main()
492