1#!/usr/bin/env python3
2#
3# Copyright (c) 2020 Intel Corporation
4#
5# SPDX-License-Identifier: Apache-2.0
6
7"""
8Dictionary-based Logging Database Generator
9
10This takes the built Zephyr ELF binary and produces a JSON database
11file for dictionary-based logging. This database is used together
12with the parser to decode binary log messages.
13"""
14
15import argparse
16import logging
17import os
18import re
19import string
20import struct
21import sys
22
23import dictionary_parser.log_database
24from dictionary_parser.log_database import LogDatabase
25from dictionary_parser.utils import extract_one_string_in_section
26from dictionary_parser.utils import find_string_in_mappings
27
28import elftools
29from elftools.elf.constants import SH_FLAGS
30from elftools.elf.elffile import ELFFile
31from elftools.elf.descriptions import describe_ei_data
32from elftools.elf.sections import SymbolTableSection
33from elftools.dwarf.descriptions import (
34    describe_DWARF_expr
35)
36from elftools.dwarf.locationlists import (
37    LocationExpr, LocationParser
38)
39
40
41LOGGER_FORMAT = "%(name)s: %(levelname)s: %(message)s"
42logger = logging.getLogger(os.path.basename(sys.argv[0]))
43
44
45# Sections that contains static strings
46STATIC_STRING_SECTIONS = [
47    'rodata',
48    '.rodata',
49    'pinned.rodata',
50]
51
52# Sections that contains static strings but are not part of the binary (allocable).
53REMOVED_STRING_SECTIONS = [
54    'log_strings'
55]
56
57
58# Regulation expression to match DWARF location
59DT_LOCATION_REGEX = re.compile(r"\(DW_OP_addr: ([0-9a-f]+)")
60
61
62# Format string for pointers (default for 32-bit pointers)
63PTR_FMT = '0x%08x'
64
65
66# Potential string encodings. Add as needed.
67STR_ENCODINGS = [
68    'ascii',
69    'iso-8859-1',
70]
71
72
73# List of acceptable escape character
74ACCEPTABLE_ESCAPE_CHARS = [
75    b'\r',
76    b'\n',
77]
78
79
80def parse_args():
81    """Parse command line arguments"""
82    argparser = argparse.ArgumentParser(allow_abbrev=False)
83
84    argparser.add_argument("elffile", help="Zephyr ELF binary")
85    argparser.add_argument("--build", help="Build ID")
86    argparser.add_argument("--build-header",
87                           help="Header file containing BUILD_VERSION define")
88    argparser.add_argument("--debug", action="store_true",
89                           help="Print extra debugging information")
90    argparser.add_argument("-v", "--verbose", action="store_true",
91                           help="Print more information")
92
93    outfile_grp = argparser.add_mutually_exclusive_group(required=True)
94    outfile_grp.add_argument("--json",
95                             help="Output Dictionary Logging Database file in JSON")
96    outfile_grp.add_argument("--syst",
97                             help="Output MIPI Sys-T Collateral XML file")
98
99    return argparser.parse_args()
100
101
102def extract_elf_code_data_sections(elf, wildcards = None):
103    """Find all sections in ELF file"""
104    sections = {}
105
106    for sect in elf.iter_sections():
107        # Only Allocated sections with PROGBITS are included
108        # since they actually have code/data.
109        #
110        # On contrary, BSS is allocated but NOBITS.
111        if (((wildcards is not None) and (sect.name in wildcards)) or
112            ((sect['sh_flags'] & SH_FLAGS.SHF_ALLOC) == SH_FLAGS.SHF_ALLOC
113            and sect['sh_type'] == 'SHT_PROGBITS')
114        ):
115            sections[sect.name] = {
116                    'name'    : sect.name,
117                    'size'    : sect['sh_size'],
118                    'start'   : sect['sh_addr'],
119                    'end'     : sect['sh_addr'] + sect['sh_size'] - 1,
120                    'data'    : sect.data(),
121                }
122
123    return sections
124
125
126def find_elf_sections(elf, sh_name):
127    """Find all sections in ELF file"""
128    for section in elf.iter_sections():
129        if section.name == sh_name:
130            ret = {
131                'name'    : section.name,
132                'size'    : section['sh_size'],
133                'start'   : section['sh_addr'],
134                'end'     : section['sh_addr'] + section['sh_size'] - 1,
135                'data'    : section.data(),
136            }
137
138            return ret
139
140    return None
141
142
143def get_kconfig_symbols(elf):
144    """Get kconfig symbols from the ELF file"""
145    for section in elf.iter_sections():
146        if isinstance(section, SymbolTableSection) and section['sh_type'] != 'SHT_DYNSYM':
147            return {sym.name: sym.entry.st_value
148                    for sym in section.iter_symbols()
149                       if sym.name.startswith("CONFIG_")}
150
151    raise LookupError("Could not find symbol table")
152
153
154def find_log_const_symbols(elf):
155    """Extract all "log_const_*" symbols from ELF file"""
156    symbol_tables = [s for s in elf.iter_sections()
157                     if isinstance(s, elftools.elf.sections.SymbolTableSection)]
158
159    ret_list = []
160
161    for section in symbol_tables:
162        if not isinstance(section, elftools.elf.sections.SymbolTableSection):
163            continue
164
165        if section['sh_entsize'] == 0:
166            continue
167
168        for symbol in section.iter_symbols():
169            if symbol.name.startswith("log_const_"):
170                ret_list.append(symbol)
171
172    return ret_list
173
174
175def parse_log_const_symbols(database, log_const_area, log_const_symbols, string_mappings):
176    """Find the log instances and map source IDs to names"""
177    if database.is_tgt_little_endian():
178        formatter = "<"
179    else:
180        formatter = ">"
181
182    if database.is_tgt_64bit():
183        # 64-bit pointer to string
184        formatter += "Q"
185    else:
186        # 32-bit pointer to string
187        formatter += "L"
188
189    # log instance level
190    formatter += "B"
191
192    datum_size = struct.calcsize(formatter)
193
194    # Get the address of first log instance
195    first_offset = log_const_symbols[0].entry['st_value']
196    for sym in log_const_symbols:
197        if sym.entry['st_value'] < first_offset:
198            first_offset = sym.entry['st_value']
199
200    first_offset -= log_const_area['start']
201
202    # find all log_const_*
203    for sym in log_const_symbols:
204        # Find data offset in log_const_area for this symbol
205        offset = sym.entry['st_value'] - log_const_area['start']
206
207        idx_s = offset
208        idx_e = offset + datum_size
209
210        datum = log_const_area['data'][idx_s:idx_e]
211
212        if len(datum) != datum_size:
213            # Not enough data to unpack
214            continue
215
216        str_ptr, level = struct.unpack(formatter, datum)
217
218        # Offset to rodata section for string
219        instance_name = find_string_in_mappings(string_mappings, str_ptr)
220        if instance_name is None:
221            instance_name = "unknown"
222
223        logger.info("Found Log Instance: %s, level: %d", instance_name, level)
224
225        # source ID is simply the element index in the log instance array
226        source_id = int((offset - first_offset) / sym.entry['st_size'])
227
228        database.add_log_instance(source_id, instance_name, level, sym.entry['st_value'])
229
230
231def extract_elf_information(elf, database):
232    """Extract information from ELF file and store in database"""
233    e_ident = elf.header['e_ident']
234    elf_data = describe_ei_data(e_ident['EI_DATA'])
235
236    if elf_data == elftools.elf.descriptions._DESCR_EI_DATA['ELFDATA2LSB']:
237        database.set_tgt_endianness(LogDatabase.LITTLE_ENDIAN)
238    elif elf_data == elftools.elf.descriptions._DESCR_EI_DATA['ELFDATA2MSB']:
239        database.set_tgt_endianness(LogDatabase.BIG_ENDIAN)
240    else:
241        logger.error("Cannot determine endianness from ELF file, exiting...")
242        sys.exit(1)
243
244
245def process_kconfigs(elf, database):
246    """Process kconfigs to extract information"""
247    kconfigs = get_kconfig_symbols(elf)
248
249    # 32 or 64-bit target
250    database.set_tgt_bits(64 if "CONFIG_64BIT" in kconfigs else 32)
251
252    # Architecture
253    for name, arch in dictionary_parser.log_database.ARCHS.items():
254        if arch['kconfig'] in kconfigs:
255            database.set_arch(name)
256            break
257    else:
258        logger.error("Did not found architecture")
259        sys.exit(1)
260
261    # Put some kconfigs into the database
262    #
263    # Use 32-bit timestamp? or 64-bit?
264    if "CONFIG_LOG_TIMESTAMP_64BIT" in kconfigs:
265        database.add_kconfig("CONFIG_LOG_TIMESTAMP_64BIT",
266                             kconfigs['CONFIG_LOG_TIMESTAMP_64BIT'])
267
268
269def extract_logging_subsys_information(elf, database, string_mappings):
270    """
271    Extract logging subsys related information and store in database.
272
273    For example, this extracts the list of log instances to establish
274    mapping from source ID to name.
275    """
276    # Extract log constant section for module names
277    section_log_const = find_elf_sections(elf, "log_const_area")
278    if section_log_const is None:
279        # ESP32 puts "log_const_*" info log_static_section instead of log_const_areas
280        section_log_const = find_elf_sections(elf, "log_static_section")
281
282    if section_log_const is None:
283        logger.error("Cannot find section 'log_const_areas' in ELF file, exiting...")
284        sys.exit(1)
285
286    # Find all "log_const_*" symbols and parse them
287    log_const_symbols = find_log_const_symbols(elf)
288    parse_log_const_symbols(database, section_log_const, log_const_symbols, string_mappings)
289
290
291def is_die_attr_ref(attr):
292    """
293    Returns True if the DIE attribute is a reference.
294    """
295    return bool(attr.form in ('DW_FORM_ref1', 'DW_FORM_ref2',
296                              'DW_FORM_ref4', 'DW_FORM_ref8',
297                              'DW_FORM_ref'))
298
299
300def find_die_var_base_type(compile_unit, die, is_const):
301    """
302    Finds the base type of a DIE and returns the name.
303    If DW_AT_type is a reference, it will recursively go through
304    the references to find the base type. Returns None is no
305    base type is found.
306    """
307    # DIE is of base type. So extract the name.
308    if die.tag == 'DW_TAG_base_type':
309        return die.attributes['DW_AT_name'].value.decode('ascii'), is_const
310
311    # Not a type, cannot continue
312    if not 'DW_AT_type' in die.attributes:
313        return None, None
314
315    if die.tag == 'DW_TAG_const_type':
316        is_const = True
317
318    # DIE is probably a reference to another.
319    # If so, check if the reference is a base type.
320    type_attr = die.attributes['DW_AT_type']
321    if is_die_attr_ref(type_attr):
322        ref_addr = compile_unit.cu_offset + type_attr.raw_value
323        ref_die = compile_unit.get_DIE_from_refaddr(ref_addr)
324
325        return find_die_var_base_type(compile_unit, ref_die, is_const)
326
327    # Not a base type, and not reference
328    return None, None
329
330
331def is_die_var_const_char(compile_unit, die):
332    """
333    Returns True if DIE of type variable is const char.
334    """
335    var_type, is_const = find_die_var_base_type(compile_unit, die, False)
336
337    if var_type is not None and var_type.endswith('char') and is_const:
338        return True
339
340    return False
341
342
343def extract_string_variables(elf):
344    """
345    Find all string variables (char) in all Compilation Units and
346    Debug information Entry (DIE) in ELF file.
347    """
348    dwarf_info = elf.get_dwarf_info()
349    loc_lists = dwarf_info.location_lists()
350    loc_parser = LocationParser(loc_lists)
351
352    strings = []
353
354    # Loop through all Compilation Units and
355    # Debug information Entry (DIE) to extract all string variables
356    for compile_unit in dwarf_info.iter_CUs():
357        for die in compile_unit.iter_DIEs():
358            # Only care about variables with location information
359            # and of type "char"
360            if die.tag == 'DW_TAG_variable':
361                if ('DW_AT_type' in die.attributes
362                    and 'DW_AT_location' in die.attributes
363                    and is_die_var_const_char(compile_unit, die)
364                ):
365                    # Extract location information, which is
366                    # its address in memory.
367                    loc_attr = die.attributes['DW_AT_location']
368                    if loc_parser.attribute_has_location(loc_attr, die.cu['version']):
369                        loc = loc_parser.parse_from_attribute(loc_attr, die.cu['version'])
370                        if isinstance(loc, LocationExpr):
371                            try:
372                                addr = describe_DWARF_expr(loc.loc_expr,
373                                                        dwarf_info.structs)
374
375                                matcher = DT_LOCATION_REGEX.match(addr)
376                                if matcher:
377                                    addr = int(matcher.group(1), 16)
378                                    if addr > 0:
379                                        strings.append({
380                                            'name': die.attributes['DW_AT_name'].value,
381                                            'addr': addr,
382                                            'die': die
383                                        })
384                            except KeyError:
385                                pass
386
387    return strings
388
389def try_decode_string(str_maybe):
390    """Check if it is a printable string"""
391    for encoding in STR_ENCODINGS:
392        try:
393            return str_maybe.decode(encoding)
394        except UnicodeDecodeError:
395            pass
396
397    return None
398
399def is_printable(b):
400    # Check if string is printable according to Python
401    # since the parser (written in Python) will need to
402    # print the string.
403    #
404    # Note that '\r' and '\n' are not included in
405    # string.printable so they need to be checked separately.
406    return (b in string.printable) or (b in ACCEPTABLE_ESCAPE_CHARS)
407
408def extract_strings_in_one_section(section, str_mappings):
409    """Extract NULL-terminated strings in one ELF section"""
410    data = section['data']
411    idx = 0
412    start = None
413    for x in data:
414        if is_printable(chr(x)):
415            # Printable character, potential part of string
416            if start is None:
417                # Beginning of potential string
418                start = idx
419        elif x == 0:
420            # End of possible string
421            if start is not None:
422                # Found potential string
423                str_maybe = data[start : idx]
424                decoded_str = try_decode_string(str_maybe)
425
426                if decoded_str is not None:
427                    addr = section['start'] + start
428
429                    if addr not in str_mappings:
430                        str_mappings[addr] = decoded_str
431
432                        # Decoded string may contain un-printable characters
433                        # (e.g. extended ASC-II characters) or control
434                        # characters (e.g. '\r' or '\n'), so simply print
435                        # the byte string instead.
436                        logger.debug('Found string via extraction at ' + PTR_FMT + ': %s',
437                                     addr, str_maybe)
438
439                        # GCC-based toolchain will reuse the NULL character
440                        # for empty strings. There is no way to know which
441                        # one is being reused, so just treat all NULL character
442                        # at the end of legitimate strings as empty strings.
443                        null_addr = section['start'] + idx
444                        str_mappings[null_addr] = ''
445
446                        logger.debug('Found null string via extraction at ' + PTR_FMT,
447                                     null_addr)
448                start = None
449        else:
450            # Non-printable byte, remove start location
451            start = None
452        idx += 1
453
454    return str_mappings
455
456
457def extract_static_strings(elf, database, section_extraction=False):
458    """
459    Extract static strings from ELF file using DWARF,
460    and also extraction from binary data.
461    """
462    string_mappings = {}
463
464    elf_sections = extract_elf_code_data_sections(elf, REMOVED_STRING_SECTIONS)
465
466    # Extract strings using ELF DWARF information
467    str_vars = extract_string_variables(elf)
468    for str_var in str_vars:
469        for _, sect in elf_sections.items():
470            one_str = extract_one_string_in_section(sect, str_var['addr'])
471            if one_str is not None:
472                string_mappings[str_var['addr']] = one_str
473                logger.debug('Found string variable at ' + PTR_FMT + ': %s',
474                             str_var['addr'], one_str)
475                break
476
477    if section_extraction:
478        # Extract strings from ELF sections
479        string_sections = STATIC_STRING_SECTIONS
480        rawstr_map = {}
481
482        # Some architectures may put static strings into additional sections.
483        # So need to extract them too.
484        arch_data = dictionary_parser.log_database.ARCHS[database.get_arch()]
485        if "extra_string_section" in arch_data:
486            string_sections.extend(arch_data['extra_string_section'])
487
488        for sect_name in string_sections:
489            if sect_name in elf_sections:
490                rawstr_map = extract_strings_in_one_section(elf_sections[sect_name],
491                                                                rawstr_map)
492
493        for one_str in rawstr_map:
494            if one_str not in string_mappings:
495                string_mappings[one_str] = rawstr_map[one_str]
496
497    return string_mappings
498
499
500def main():
501    """Main function of database generator"""
502    args = parse_args()
503
504    # Setup logging
505    logging.basicConfig(format=LOGGER_FORMAT, level=logging.WARNING)
506    if args.debug:
507        logger.setLevel(logging.DEBUG)
508    elif args.verbose:
509        logger.setLevel(logging.INFO)
510
511    elffile = open(args.elffile, "rb")
512    if not elffile:
513        logger.error("ERROR: Cannot open ELF file: %s, exiting...", args.elffile)
514        sys.exit(1)
515
516    logger.info("ELF file %s", args.elffile)
517
518    if args.json:
519        logger.info("JSON Database file %s", args.json)
520        section_extraction = True
521
522    if args.syst:
523        logger.info("MIPI Sys-T Collateral file %s", args.syst)
524        section_extraction = False
525
526    elf = ELFFile(elffile)
527
528    database = LogDatabase()
529
530    if args.build_header:
531        with open(args.build_header) as f:
532            for l in f:
533                match = re.match(r'\s*#define\s+BUILD_VERSION\s+(.*)', l)
534                if match:
535                    database.set_build_id(match.group(1))
536                    break
537
538    if args.build:
539        database.set_build_id(args.build)
540        logger.info("Build ID: %s", args.build)
541
542    extract_elf_information(elf, database)
543
544    process_kconfigs(elf, database)
545
546    logger.info("Target: %s, %d-bit", database.get_arch(), database.get_tgt_bits())
547    if database.is_tgt_little_endian():
548        logger.info("Endianness: Little")
549    else:
550        logger.info("Endianness: Big")
551
552    if database.is_tgt_64bit():
553        global PTR_FMT
554        PTR_FMT = '0x%016x'
555
556    # Extract strings from ELF files
557    string_mappings = extract_static_strings(elf, database, section_extraction)
558    if len(string_mappings) > 0:
559        database.set_string_mappings(string_mappings)
560        logger.info("Found %d strings", len(string_mappings))
561
562    # Extract information related to logging subsystem
563    if not section_extraction:
564        # The logging subsys information (e.g. log module names)
565        # may require additional strings outside of those extracted
566        # via ELF DWARF variables. So generate a new string mappings
567        # with strings in various ELF sections.
568        string_mappings = extract_static_strings(elf, database, section_extraction=True)
569
570    extract_logging_subsys_information(elf, database, string_mappings)
571
572    # Write database file
573    if args.json:
574        if not LogDatabase.write_json_database(args.json, database):
575            logger.error("ERROR: Cannot open database file for write: %s, exiting...",
576                         args.json)
577            sys.exit(1)
578
579    if args.syst:
580        if not LogDatabase.write_syst_database(args.syst, database):
581            logger.error("ERROR: Cannot open database file for write: %s, exiting...",
582                         args.syst)
583            sys.exit(1)
584
585    elffile.close()
586
587
588if __name__ == "__main__":
589    main()
590