1# Copyright (c) 2020, 2021 The Linux Foundation
2#
3# SPDX-License-Identifier: Apache-2.0
4
5import hashlib
6import os
7import re
8
9from west import log
10
11from zspdx.licenses import LICENSES
12from zspdx.util import getHashes
13
14
15# ScannerConfig contains settings used to configure how the SPDX
16# Document scanning should occur.
class ScannerConfig:
    """Tunable options that control how a Document is scanned."""

    def __init__(self):
        super().__init__()

        # --- license conclusion ---

        # derive each Package's concluded license from the licenses of
        # the Files it contains?
        self.shouldConcludePackageLicense = True

        # derive each File's concluded license from the license(s)
        # detected inside that File?
        self.shouldConcludeFileLicenses = True

        # --- scan depth ---

        # how many lines to search for an SPDX-License-Identifier tag;
        # 0 means search the whole file (default: 20)
        self.numLinesScanned = 20

        # --- optional hashes ---

        # record a SHA256 hash for every File? (SHA1 is always recorded,
        # since SPDX 2.3 makes it mandatory)
        self.doSHA256 = True

        # record an MD5 hash for every File?
        self.doMD5 = False
39
40
def parseLineForExpression(line):
    """
    Return the SPDX expression following an SPDX-License-Identifier: tag.

    Arguments:
        - line: a single line of text from a scanned file
    Returns: the expression text with surrounding whitespace and any
             trailing "/" and "*" comment characters removed; None if
             the tag is absent or nothing usable follows it.
    """
    _, _, remainder = line.partition("SPDX-License-Identifier:")

    # strip whitespace, then trailing comment-closing characters (e.g. the
    # "*/" of a C block comment), then whitespace that preceded them
    expression = remainder.strip().rstrip("/*").strip()

    # a tag with no expression after it is treated as "not found", so
    # callers never receive an empty string as a license expression
    return expression or None
51
52
def getExpressionData(filePath, numLines):
    """
    Scans the specified file for the first non-empty
    SPDX-License-Identifier: tag in the file.

    The file is decoded as UTF-8 regardless of the platform's locale
    encoding; a file that is not valid UTF-8 yields None.

    Arguments:
        - filePath: path to file to scan.
        - numLines: number of lines to scan for an expression before
                    giving up. If 0, will scan the entire file.
    Returns: parsed expression if found; None if not found.
    """
    log.dbg(f"  - getting licenses for {filePath}")

    # decode explicitly as UTF-8 so the result (and whether the handler
    # below fires) does not depend on the locale's default encoding
    with open(filePath, "r", encoding="utf-8") as f:
        try:
            # decoding happens lazily during iteration, so the loop itself
            # must sit inside the try
            for lineno, line in enumerate(f, start=1):
                # numLines == 0 disables the limit
                if lineno > numLines > 0:
                    break
                expression = parseLineForExpression(line)
                # skip lines without a tag, and tags with nothing after them
                if expression:
                    return expression
        except UnicodeDecodeError:
            # invalid UTF-8 content
            return None

    # if we get here, we didn't find an expression
    return None
82
83
def splitExpression(expression):
    """
    Parse a license expression into its constituent identifiers.

    Parentheses are treated as separators, "+" signs are dropped, and the
    operators AND / OR / WITH (in any letter case) are discarded. Runs of
    whitespace never produce empty identifiers.

    Arguments:
        - expression: SPDX license expression
    Returns: sorted array of split identifiers (duplicates are kept)
    """
    operators = ("AND", "OR", "WITH")

    # drop the plus sign outright so "GPL-2.0+" becomes "GPL-2.0"
    cleaned = expression.replace("+", "")

    # parens become spaces rather than vanishing, so an operator written
    # directly against a paren is still seen as its own token
    cleaned = cleaned.replace("(", " ").replace(")", " ")

    # split() with no argument splits on any whitespace run and never
    # yields empty strings, unlike split(" ")
    return sorted(
        token for token in cleaned.split() if token.upper() not in operators
    )
102
103
def calculateVerificationCode(pkg):
    """
    Compute the SPDX Package Verification Code over a package's files.

    The SHA1 values of all files are sorted, concatenated without any
    separator, and the SHA1 of that concatenation is returned.

    Arguments:
        - pkg: Package
    Returns: verification code as string
    """
    sortedHashes = sorted(f.sha1 for f in pkg.files.values())
    combined = "".join(sortedHashes).encode('utf-8')
    return hashlib.sha1(combined).hexdigest()
121
122
def checkLicenseValid(lic, doc):
    """
    Record a license ID on the Document as custom when it is not one of
    the IDs listed in LICENSES.

    Arguments:
        - lic: detected license ID
        - doc: Document
    """
    # IDs present in LICENSES need no bookkeeping
    if lic in LICENSES:
        return

    # anything else is tracked in the Document's custom license ID set
    doc.customLicenseIDs.add(lic)
134
135
def getPackageLicenses(pkg):
    """
    Collect the distinct concluded licenses and infoInFile IDs across
    every file in the package.

    Arguments:
        - pkg: Package
    Returns: sorted list of concluded license exprs,
             sorted list of infoInFile ID's
    """
    allFiles = pkg.files.values()

    # sets collapse values that appear in more than one file
    concluded = {f.concludedLicense for f in allFiles}
    inFiles = {licInfo for f in allFiles for licInfo in f.licenseInfoInFile}

    return sorted(concluded), sorted(inFiles)
152
153
def normalizeExpression(licsConcluded):
    """
    Combine array of license expressions into one AND'd expression,
    adding parens where needed.

    When more than one expression is given, "NONE" and "NOASSERTION"
    entries are left out of the combined result. If nothing remains after
    that, "NOASSERTION" is returned rather than an empty string.

    Arguments:
        - licsConcluded: array of license expressions
    Returns: string with single AND'd expression.
    """
    # return appropriate for simple cases
    if not licsConcluded:
        return "NOASSERTION"
    if len(licsConcluded) == 1:
        return licsConcluded[0]

    # more than one, so we'll need to combine them
    # if and only if an expression has spaces, it needs parens
    revised = [
        f"({lic})" if " " in lic else lic
        for lic in licsConcluded
        if lic not in ("NONE", "NOASSERTION")
    ]

    # every entry was NONE / NOASSERTION: there is nothing to AND together,
    # and joining an empty list would produce an invalid empty expression
    if not revised:
        return "NOASSERTION"

    return " AND ".join(revised)
180
181
def scanDocument(cfg, doc):
    """
    Walk every Package in the Document, filling in per-File hashes and
    license data first, then the Package-level license and verification
    data derived from those Files.

    Arguments:
        - cfg: ScannerConfig
        - doc: Document
    """
    for pkg in doc.pkgs.values():
        log.inf(f"scanning files in package {pkg.cfg.name} in document {doc.cfg.name}")

        # step 1: per-File data
        for f in pkg.files.values():
            # relpath is expressed relative to the package's relativeBaseDir
            f.relpath = os.path.relpath(f.abspath, pkg.cfg.relativeBaseDir)

            # a falsy result from getHashes means the file could not be
            # hashed; warn and leave the rest of this File untouched
            digests = getHashes(f.abspath)
            if not digests:
                log.wrn(f"unable to get hashes for file {f.abspath}; skipping")
                continue

            # SHA1 is always stored; SHA256 and MD5 only when enabled
            sha1Value, sha256Value, md5Value = digests
            f.sha1 = sha1Value
            if cfg.doSHA256:
                f.sha256 = sha256Value
            if cfg.doMD5:
                f.md5 = md5Value

            # look for a license expression inside the file
            licExpr = getExpressionData(f.abspath, cfg.numLinesScanned)
            if licExpr:
                if cfg.shouldConcludeFileLicenses:
                    f.concludedLicense = licExpr
                f.licenseInfoInFile = splitExpression(licExpr)

            # flag any IDs that need to be tracked as custom on the document
            for licId in f.licenseInfoInFile:
                checkLicenseValid(licId, doc)

        # step 2: Package data, built from the Files processed above
        concludedExprs, idsFromFiles = getPackageLicenses(pkg)
        if cfg.shouldConcludePackageLicense:
            pkg.concludedLicense = normalizeExpression(concludedExprs)
        pkg.licenseInfoFromFiles = idsFromFiles
        pkg.verificationCode = calculateVerificationCode(pkg)
228