# Copyright (c) 2020, 2021 The Linux Foundation
#
# SPDX-License-Identifier: Apache-2.0

import hashlib
import os
import re

from west import log

from zspdx.licenses import LICENSES
from zspdx.util import getHashes


# ScannerConfig contains settings used to configure how the SPDX
# Document scanning should occur.
class ScannerConfig:
    def __init__(self):
        super().__init__()

        # when assembling a Package's data, should we auto-conclude the
        # Package's license, based on the licenses of its Files?
        self.shouldConcludePackageLicense = True

        # when assembling a Package's Files' data, should we auto-conclude
        # each File's license, based on its detected license(s)?
        self.shouldConcludeFileLicenses = True

        # number of lines to scan for SPDX-License-Identifier (0 = all)
        # defaults to 20
        self.numLinesScanned = 20

        # should we calculate SHA256 hashes for each Package's Files?
        # note that SHA1 hashes are mandatory, per SPDX 2.3
        self.doSHA256 = True

        # should we calculate MD5 hashes for each Package's Files?
        self.doMD5 = False


def parseLineForExpression(line):
    """
    Return parsed SPDX expression if tag found in line, or None otherwise.

    A tag that is followed by nothing except whitespace and/or trailing
    comment marks ("/" and "*") carries no expression, so it is reported
    as None as well; this lets callers keep scanning subsequent lines.

    Arguments:
        - line: single line of text to inspect
    Returns: expression string if a non-empty one follows the tag;
             None otherwise.
    """
    # partition() yields an empty remainder when the tag is absent
    _, _, remainder = line.partition("SPDX-License-Identifier:")

    # strip away trailing comment marks and whitespace, if any
    expression = remainder.strip().rstrip("/*").strip()

    # an empty result means "no usable expression on this line"
    return expression or None


def getExpressionData(filePath, numLines):
    """
    Scans the specified file for the first SPDX-License-Identifier:
    tag in the file.

    The file is read as UTF-8 text. Errors raised while opening the
    file are not handled here and propagate to the caller.

    Arguments:
        - filePath: path to file to scan.
        - numLines: number of lines to scan for an expression before
                    giving up. If 0, will scan the entire file.
    Returns: parsed expression if found; None if not found, or if
             undecodable (non-UTF-8) content is hit before a tag is found.
    """
    log.dbg(f" - getting licenses for {filePath}")

    # decode explicitly as UTF-8 so that results do not depend on the
    # locale's default encoding
    with open(filePath, "r", encoding="utf-8") as f:
        try:
            for lineno, line in enumerate(f, start=1):
                # numLines == 0 disables the limit
                if lineno > numLines > 0:
                    break
                expression = parseLineForExpression(line)
                if expression is not None:
                    return expression
        except UnicodeDecodeError:
            # invalid UTF-8 content
            return None

    # if we get here, we didn't find an expression
    return None


def splitExpression(expression):
    """
    Parse a license expression into its constituent identifiers.

    Arguments:
        - expression: SPDX license expression
    Returns: sorted array of split identifiers (duplicates are kept;
             never contains empty strings)
    """
    # remove parens and plus sign
    stripped = re.sub(r'\(|\)|\+', "", expression)

    # remove word operators, ignoring case, leaving a blank space
    operands = re.sub(r' AND | OR | WITH ', " ", stripped, flags=re.IGNORECASE)

    # split on any run of whitespace, so that leading, trailing or
    # repeated spaces do not produce empty "identifiers"
    return sorted(operands.split())


def calculateVerificationCode(pkg):
    """
    Calculate the SPDX Package Verification Code for all files in the package.

    The code is the SHA1 digest of the concatenation of the sorted
    per-file SHA1 values.

    Arguments:
        - pkg: Package
    Returns: verification code as string
    """
    hashes = sorted(f.sha1 for f in pkg.files.values())
    filelist = "".join(hashes)

    hSHA1 = hashlib.sha1()
    hSHA1.update(filelist.encode('utf-8'))
    return hSHA1.hexdigest()


def checkLicenseValid(lic, doc):
    """
    Check whether this license ID is a valid SPDX license ID, and add it
    to the custom license IDs set for this Document if it isn't.

    Arguments:
        - lic: detected license ID
        - doc: Document
    """
    if lic not in LICENSES:
        doc.customLicenseIDs.add(lic)


def getPackageLicenses(pkg):
    """
    Extract lists of all concluded and infoInFile licenses seen.

    Arguments:
        - pkg: Package
    Returns: sorted list of concluded license exprs,
             sorted list of infoInFile ID's
    """
    files = pkg.files.values()
    # sets are used so that each expression / ID is reported only once
    licsConcluded = {f.concludedLicense for f in files}
    licsFromFiles = {licInfo for f in files for licInfo in f.licenseInfoInFile}
    return sorted(licsConcluded), sorted(licsFromFiles)


def normalizeExpression(licsConcluded):
    """
    Combine array of license expressions into one AND'd expression,
    adding parens where needed.

    When more than one expression is given, "NONE" and "NOASSERTION"
    entries are dropped before combining.

    Arguments:
        - licsConcluded: array of license expressions
    Returns: string with single AND'd expression; "NOASSERTION" if there
             is nothing to combine.
    """
    # return appropriate for simple cases
    if len(licsConcluded) == 0:
        return "NOASSERTION"
    if len(licsConcluded) == 1:
        return licsConcluded[0]

    # more than one, so we'll need to combine them
    # if and only if an expression has spaces, it needs parens
    revised = [
        f"({lic})" if " " in lic else lic
        for lic in licsConcluded
        if lic not in ("NONE", "NOASSERTION")
    ]

    # every entry was NONE / NOASSERTION: an empty string would not be a
    # valid value, so fall back to NOASSERTION
    if not revised:
        return "NOASSERTION"

    return " AND ".join(revised)


def scanDocument(cfg, doc):
    """
    Scan for licenses and calculate hashes for all Files and Packages
    in this Document.

    Each File gets its relpath, hashes and (if a tag is found) license
    data filled in; each Package then gets its concluded license,
    licenseInfoFromFiles and verification code. Unknown license IDs are
    recorded in the Document's custom license ID set.

    Arguments:
        - cfg: ScannerConfig
        - doc: Document
    """
    for pkg in doc.pkgs.values():
        log.inf(f"scanning files in package {pkg.cfg.name} in document {doc.cfg.name}")

        # first, gather File data for this package
        for f in pkg.files.values():
            # set relpath based on package's relativeBaseDir
            f.relpath = os.path.relpath(f.abspath, pkg.cfg.relativeBaseDir)

            # get hashes for file
            hashes = getHashes(f.abspath)
            if not hashes:
                # no license scan is attempted for a file we couldn't hash
                log.wrn(f"unable to get hashes for file {f.abspath}; skipping")
                continue
            hSHA1, hSHA256, hMD5 = hashes
            f.sha1 = hSHA1
            if cfg.doSHA256:
                f.sha256 = hSHA256
            if cfg.doMD5:
                f.md5 = hMD5

            # get licenses for file
            expression = getExpressionData(f.abspath, cfg.numLinesScanned)
            if expression:
                if cfg.shouldConcludeFileLicenses:
                    f.concludedLicense = expression
                f.licenseInfoInFile = splitExpression(expression)

            # check if any custom license IDs should be flagged for document
            for lic in f.licenseInfoInFile:
                checkLicenseValid(lic, doc)

        # now, assemble the Package data
        licsConcluded, licsFromFiles = getPackageLicenses(pkg)
        if cfg.shouldConcludePackageLicense:
            pkg.concludedLicense = normalizeExpression(licsConcluded)
        pkg.licenseInfoFromFiles = licsFromFiles
        pkg.verificationCode = calculateVerificationCode(pkg)