1#!/usr/bin/env python3
2#
3# Copyright (c) 2020 Intel Corporation
4#
5# SPDX-License-Identifier: Apache-2.0
6
7"""
8Dictionary-based Logging Database Generator
9
10This takes the built Zephyr ELF binary and produces a JSON database
11file for dictionary-based logging. This database is used together
12with the parser to decode binary log messages.
13"""
14
15import argparse
16import logging
17import os
18import re
19import string
20import struct
21import sys
22
23import dictionary_parser.log_database
24from dictionary_parser.log_database import LogDatabase
25from dictionary_parser.utils import extract_one_string_in_section
26from dictionary_parser.utils import find_string_in_mappings
27
28import elftools
29from elftools.elf.constants import SH_FLAGS
30from elftools.elf.elffile import ELFFile
31from elftools.elf.descriptions import describe_ei_data
32from elftools.elf.sections import SymbolTableSection
33from elftools.dwarf.descriptions import (
34    describe_DWARF_expr
35)
36from elftools.dwarf.locationlists import (
37    LocationExpr, LocationParser
38)
39
40
41LOGGER_FORMAT = "%(name)s: %(levelname)s: %(message)s"
42logger = logging.getLogger(os.path.basename(sys.argv[0]))
43
44
45# Sections that contains static strings
46STATIC_STRING_SECTIONS = [
47    'rodata',
48    '.rodata',
49    'pinned.rodata',
50]
51
52# Sections that contains static strings but are not part of the binary (allocable).
53REMOVED_STRING_SECTIONS = [
54    'log_strings'
55]
56
57
58# Regulation expression to match DWARF location
59DT_LOCATION_REGEX = re.compile(r"\(DW_OP_addr: ([0-9a-f]+)")
60
61
62# Format string for pointers (default for 32-bit pointers)
63PTR_FMT = '0x%08x'
64
65
66# Potential string encodings. Add as needed.
67STR_ENCODINGS = [
68    'ascii',
69    'iso-8859-1',
70]
71
72
73# List of acceptable escape character
74ACCEPTABLE_ESCAPE_CHARS = [
75    b'\r',
76    b'\n',
77]
78
79
80def parse_args():
81    """Parse command line arguments"""
82    argparser = argparse.ArgumentParser(allow_abbrev=False)
83
84    argparser.add_argument("elffile", help="Zephyr ELF binary")
85    argparser.add_argument("--build", help="Build ID")
86    argparser.add_argument("--build-header",
87                           help="Header file containing BUILD_VERSION define")
88    argparser.add_argument("--debug", action="store_true",
89                           help="Print extra debugging information")
90    argparser.add_argument("-v", "--verbose", action="store_true",
91                           help="Print more information")
92
93    outfile_grp = argparser.add_mutually_exclusive_group(required=True)
94    outfile_grp.add_argument("--json",
95                             help="Output Dictionary Logging Database file in JSON")
96    outfile_grp.add_argument("--syst",
97                             help="Output MIPI Sys-T Collateral XML file")
98
99    return argparser.parse_args()
100
101
102def extract_elf_code_data_sections(elf, wildcards = None):
103    """Find all sections in ELF file"""
104    sections = {}
105
106    for sect in elf.iter_sections():
107        # Only Allocated sections with PROGBITS are included
108        # since they actually have code/data.
109        #
110        # On contrary, BSS is allocated but NOBITS.
111        if (((wildcards is not None) and (sect.name in wildcards)) or
112            ((sect['sh_flags'] & SH_FLAGS.SHF_ALLOC) == SH_FLAGS.SHF_ALLOC
113            and sect['sh_type'] == 'SHT_PROGBITS')
114        ):
115            sections[sect.name] = {
116                    'name'    : sect.name,
117                    'size'    : sect['sh_size'],
118                    'start'   : sect['sh_addr'],
119                    'end'     : sect['sh_addr'] + sect['sh_size'] - 1,
120                    'data'    : sect.data(),
121                }
122
123    return sections
124
125
126def find_elf_sections(elf, sh_name):
127    """Find all sections in ELF file"""
128    for section in elf.iter_sections():
129        if section.name == sh_name:
130            ret = {
131                'name'    : section.name,
132                'size'    : section['sh_size'],
133                'start'   : section['sh_addr'],
134                'end'     : section['sh_addr'] + section['sh_size'] - 1,
135                'data'    : section.data(),
136            }
137
138            return ret
139
140    return None
141
142
143def get_kconfig_symbols(elf):
144    """Get kconfig symbols from the ELF file"""
145    for section in elf.iter_sections():
146        if isinstance(section, SymbolTableSection):
147            return {sym.name: sym.entry.st_value
148                    for sym in section.iter_symbols()
149                       if sym.name.startswith("CONFIG_")}
150
151    raise LookupError("Could not find symbol table")
152
153
154def find_log_const_symbols(elf):
155    """Extract all "log_const_*" symbols from ELF file"""
156    symbol_tables = [s for s in elf.iter_sections()
157                     if isinstance(s, elftools.elf.sections.SymbolTableSection)]
158
159    ret_list = []
160
161    for section in symbol_tables:
162        if not isinstance(section, elftools.elf.sections.SymbolTableSection):
163            continue
164
165        if section['sh_entsize'] == 0:
166            continue
167
168        for symbol in section.iter_symbols():
169            if symbol.name.startswith("log_const_"):
170                ret_list.append(symbol)
171
172    return ret_list
173
174
175def parse_log_const_symbols(database, log_const_area, log_const_symbols, string_mappings):
176    """Find the log instances and map source IDs to names"""
177    if database.is_tgt_little_endian():
178        formatter = "<"
179    else:
180        formatter = ">"
181
182    if database.is_tgt_64bit():
183        # 64-bit pointer to string
184        formatter += "Q"
185    else:
186        # 32-bit pointer to string
187        formatter += "L"
188
189    # log instance level
190    formatter += "B"
191
192    datum_size = struct.calcsize(formatter)
193
194    # Get the address of first log instance
195    first_offset = log_const_symbols[0].entry['st_value']
196    for sym in log_const_symbols:
197        if sym.entry['st_value'] < first_offset:
198            first_offset = sym.entry['st_value']
199
200    first_offset -= log_const_area['start']
201
202    # find all log_const_*
203    for sym in log_const_symbols:
204        # Find data offset in log_const_area for this symbol
205        offset = sym.entry['st_value'] - log_const_area['start']
206
207        idx_s = offset
208        idx_e = offset + datum_size
209
210        datum = log_const_area['data'][idx_s:idx_e]
211
212        if len(datum) != datum_size:
213            # Not enough data to unpack
214            continue
215
216        str_ptr, level = struct.unpack(formatter, datum)
217
218        # Offset to rodata section for string
219        instance_name = find_string_in_mappings(string_mappings, str_ptr)
220        if instance_name is None:
221            instance_name = "unknown"
222
223        logger.info("Found Log Instance: %s, level: %d", instance_name, level)
224
225        # source ID is simply the element index in the log instance array
226        source_id = int((offset - first_offset) / sym.entry['st_size'])
227
228        database.add_log_instance(source_id, instance_name, level, sym.entry['st_value'])
229
230
231def extract_elf_information(elf, database):
232    """Extract information from ELF file and store in database"""
233    e_ident = elf.header['e_ident']
234    elf_data = describe_ei_data(e_ident['EI_DATA'])
235
236    if elf_data == elftools.elf.descriptions._DESCR_EI_DATA['ELFDATA2LSB']:
237        database.set_tgt_endianness(LogDatabase.LITTLE_ENDIAN)
238    elif elf_data == elftools.elf.descriptions._DESCR_EI_DATA['ELFDATA2MSB']:
239        database.set_tgt_endianness(LogDatabase.BIG_ENDIAN)
240    else:
241        logger.error("Cannot determine endianness from ELF file, exiting...")
242        sys.exit(1)
243
244
245def process_kconfigs(elf, database):
246    """Process kconfigs to extract information"""
247    kconfigs = get_kconfig_symbols(elf)
248
249    # 32 or 64-bit target
250    database.set_tgt_bits(64 if "CONFIG_64BIT" in kconfigs else 32)
251
252    # Architecture
253    for name, arch in dictionary_parser.log_database.ARCHS.items():
254        if arch['kconfig'] in kconfigs:
255            database.set_arch(name)
256            break
257
258    # Put some kconfigs into the database
259    #
260    # Use 32-bit timestamp? or 64-bit?
261    if "CONFIG_LOG_TIMESTAMP_64BIT" in kconfigs:
262        database.add_kconfig("CONFIG_LOG_TIMESTAMP_64BIT",
263                             kconfigs['CONFIG_LOG_TIMESTAMP_64BIT'])
264
265
266def extract_logging_subsys_information(elf, database, string_mappings):
267    """
268    Extract logging subsys related information and store in database.
269
270    For example, this extracts the list of log instances to establish
271    mapping from source ID to name.
272    """
273    # Extract log constant section for module names
274    section_log_const = find_elf_sections(elf, "log_const_area")
275    if section_log_const is None:
276        # ESP32 puts "log_const_*" info log_static_section instead of log_const_areas
277        section_log_const = find_elf_sections(elf, "log_static_section")
278
279    if section_log_const is None:
280        logger.error("Cannot find section 'log_const_areas' in ELF file, exiting...")
281        sys.exit(1)
282
283    # Find all "log_const_*" symbols and parse them
284    log_const_symbols = find_log_const_symbols(elf)
285    parse_log_const_symbols(database, section_log_const, log_const_symbols, string_mappings)
286
287
288def is_die_attr_ref(attr):
289    """
290    Returns True if the DIE attribute is a reference.
291    """
292    return bool(attr.form in ('DW_FORM_ref1', 'DW_FORM_ref2',
293                              'DW_FORM_ref4', 'DW_FORM_ref8',
294                              'DW_FORM_ref'))
295
296
297def find_die_var_base_type(compile_unit, die, is_const):
298    """
299    Finds the base type of a DIE and returns the name.
300    If DW_AT_type is a reference, it will recursively go through
301    the references to find the base type. Returns None is no
302    base type is found.
303    """
304    # DIE is of base type. So extract the name.
305    if die.tag == 'DW_TAG_base_type':
306        return die.attributes['DW_AT_name'].value.decode('ascii'), is_const
307
308    # Not a type, cannot continue
309    if not 'DW_AT_type' in die.attributes:
310        return None, None
311
312    if die.tag == 'DW_TAG_const_type':
313        is_const = True
314
315    # DIE is probably a reference to another.
316    # If so, check if the reference is a base type.
317    type_attr = die.attributes['DW_AT_type']
318    if is_die_attr_ref(type_attr):
319        ref_addr = compile_unit.cu_offset + type_attr.raw_value
320        ref_die = compile_unit.get_DIE_from_refaddr(ref_addr)
321
322        return find_die_var_base_type(compile_unit, ref_die, is_const)
323
324    # Not a base type, and not reference
325    return None, None
326
327
328def is_die_var_const_char(compile_unit, die):
329    """
330    Returns True if DIE of type variable is const char.
331    """
332    var_type, is_const = find_die_var_base_type(compile_unit, die, False)
333
334    if var_type is not None and var_type.endswith('char') and is_const:
335        return True
336
337    return False
338
339
340def extract_string_variables(elf):
341    """
342    Find all string variables (char) in all Compilation Units and
343    Debug information Entry (DIE) in ELF file.
344    """
345    dwarf_info = elf.get_dwarf_info()
346    loc_lists = dwarf_info.location_lists()
347    loc_parser = LocationParser(loc_lists)
348
349    strings = []
350
351    # Loop through all Compilation Units and
352    # Debug information Entry (DIE) to extract all string variables
353    for compile_unit in dwarf_info.iter_CUs():
354        for die in compile_unit.iter_DIEs():
355            # Only care about variables with location information
356            # and of type "char"
357            if die.tag == 'DW_TAG_variable':
358                if ('DW_AT_type' in die.attributes
359                    and 'DW_AT_location' in die.attributes
360                    and is_die_var_const_char(compile_unit, die)
361                ):
362                    # Extract location information, which is
363                    # its address in memory.
364                    loc_attr = die.attributes['DW_AT_location']
365                    if loc_parser.attribute_has_location(loc_attr, die.cu['version']):
366                        loc = loc_parser.parse_from_attribute(loc_attr, die.cu['version'])
367                        if isinstance(loc, LocationExpr):
368                            try:
369                                addr = describe_DWARF_expr(loc.loc_expr,
370                                                        dwarf_info.structs)
371
372                                matcher = DT_LOCATION_REGEX.match(addr)
373                                if matcher:
374                                    addr = int(matcher.group(1), 16)
375                                    if addr > 0:
376                                        strings.append({
377                                            'name': die.attributes['DW_AT_name'].value,
378                                            'addr': addr,
379                                            'die': die
380                                        })
381                            except KeyError:
382                                pass
383
384    return strings
385
386def try_decode_string(str_maybe):
387    """Check if it is a printable string"""
388    for encoding in STR_ENCODINGS:
389        try:
390            return str_maybe.decode(encoding)
391        except UnicodeDecodeError:
392            pass
393
394    return None
395
396def is_printable(b):
397    # Check if string is printable according to Python
398    # since the parser (written in Python) will need to
399    # print the string.
400    #
401    # Note that '\r' and '\n' are not included in
402    # string.printable so they need to be checked separately.
403    return (b in string.printable) or (b in ACCEPTABLE_ESCAPE_CHARS)
404
405def extract_strings_in_one_section(section, str_mappings):
406    """Extract NULL-terminated strings in one ELF section"""
407    data = section['data']
408    idx = 0
409    start = None
410    for x in data:
411        if is_printable(chr(x)):
412            # Printable character, potential part of string
413            if start is None:
414                # Beginning of potential string
415                start = idx
416        elif x == 0:
417            # End of possible string
418            if start is not None:
419                # Found potential string
420                str_maybe = data[start : idx]
421                decoded_str = try_decode_string(str_maybe)
422
423                if decoded_str is not None:
424                    addr = section['start'] + start
425
426                    if addr not in str_mappings:
427                        str_mappings[addr] = decoded_str
428
429                        # Decoded string may contain un-printable characters
430                        # (e.g. extended ASC-II characters) or control
431                        # characters (e.g. '\r' or '\n'), so simply print
432                        # the byte string instead.
433                        logger.debug('Found string via extraction at ' + PTR_FMT + ': %s',
434                                     addr, str_maybe)
435
436                        # GCC-based toolchain will reuse the NULL character
437                        # for empty strings. There is no way to know which
438                        # one is being reused, so just treat all NULL character
439                        # at the end of legitimate strings as empty strings.
440                        null_addr = section['start'] + idx
441                        str_mappings[null_addr] = ''
442
443                        logger.debug('Found null string via extraction at ' + PTR_FMT,
444                                     null_addr)
445                start = None
446        else:
447            # Non-printable byte, remove start location
448            start = None
449        idx += 1
450
451    return str_mappings
452
453
454def extract_static_strings(elf, database, section_extraction=False):
455    """
456    Extract static strings from ELF file using DWARF,
457    and also extraction from binary data.
458    """
459    string_mappings = {}
460
461    elf_sections = extract_elf_code_data_sections(elf, REMOVED_STRING_SECTIONS)
462
463    # Extract strings using ELF DWARF information
464    str_vars = extract_string_variables(elf)
465    for str_var in str_vars:
466        for _, sect in elf_sections.items():
467            one_str = extract_one_string_in_section(sect, str_var['addr'])
468            if one_str is not None:
469                string_mappings[str_var['addr']] = one_str
470                logger.debug('Found string variable at ' + PTR_FMT + ': %s',
471                             str_var['addr'], one_str)
472                break
473
474    if section_extraction:
475        # Extract strings from ELF sections
476        string_sections = STATIC_STRING_SECTIONS
477        rawstr_map = {}
478
479        # Some architectures may put static strings into additional sections.
480        # So need to extract them too.
481        arch_data = dictionary_parser.log_database.ARCHS[database.get_arch()]
482        if "extra_string_section" in arch_data:
483            string_sections.extend(arch_data['extra_string_section'])
484
485        for sect_name in string_sections:
486            if sect_name in elf_sections:
487                rawstr_map = extract_strings_in_one_section(elf_sections[sect_name],
488                                                                rawstr_map)
489
490        for one_str in rawstr_map:
491            if one_str not in string_mappings:
492                string_mappings[one_str] = rawstr_map[one_str]
493
494    return string_mappings
495
496
497def main():
498    """Main function of database generator"""
499    args = parse_args()
500
501    # Setup logging
502    logging.basicConfig(format=LOGGER_FORMAT, level=logging.WARNING)
503    if args.debug:
504        logger.setLevel(logging.DEBUG)
505    elif args.verbose:
506        logger.setLevel(logging.INFO)
507
508    elffile = open(args.elffile, "rb")
509    if not elffile:
510        logger.error("ERROR: Cannot open ELF file: %s, exiting...", args.elffile)
511        sys.exit(1)
512
513    logger.info("ELF file %s", args.elffile)
514
515    if args.json:
516        logger.info("JSON Database file %s", args.json)
517        section_extraction = True
518
519    if args.syst:
520        logger.info("MIPI Sys-T Collateral file %s", args.syst)
521        section_extraction = False
522
523    elf = ELFFile(elffile)
524
525    database = LogDatabase()
526
527    if args.build_header:
528        with open(args.build_header) as f:
529            for l in f:
530                match = re.match(r'\s*#define\s+BUILD_VERSION\s+(.*)', l)
531                if match:
532                    database.set_build_id(match.group(1))
533                    break
534
535    if args.build:
536        database.set_build_id(args.build)
537        logger.info("Build ID: %s", args.build)
538
539    extract_elf_information(elf, database)
540
541    process_kconfigs(elf, database)
542
543    logger.info("Target: %s, %d-bit", database.get_arch(), database.get_tgt_bits())
544    if database.is_tgt_little_endian():
545        logger.info("Endianness: Little")
546    else:
547        logger.info("Endianness: Big")
548
549    if database.is_tgt_64bit():
550        global PTR_FMT
551        PTR_FMT = '0x%016x'
552
553    # Extract strings from ELF files
554    string_mappings = extract_static_strings(elf, database, section_extraction)
555    if len(string_mappings) > 0:
556        database.set_string_mappings(string_mappings)
557        logger.info("Found %d strings", len(string_mappings))
558
559    # Extract information related to logging subsystem
560    if not section_extraction:
561        # The logging subsys information (e.g. log module names)
562        # may require additional strings outside of those extracted
563        # via ELF DWARF variables. So generate a new string mappings
564        # with strings in various ELF sections.
565        string_mappings = extract_static_strings(elf, database, section_extraction=True)
566
567    extract_logging_subsys_information(elf, database, string_mappings)
568
569    # Write database file
570    if args.json:
571        if not LogDatabase.write_json_database(args.json, database):
572            logger.error("ERROR: Cannot open database file for write: %s, exiting...",
573                         args.json)
574            sys.exit(1)
575
576    if args.syst:
577        if not LogDatabase.write_syst_database(args.syst, database):
578            logger.error("ERROR: Cannot open database file for write: %s, exiting...",
579                         args.syst)
580            sys.exit(1)
581
582    elffile.close()
583
584
585if __name__ == "__main__":
586    main()
587