1#!/usr/bin/env python3
2#
3# Copyright (c) 2020 Intel Corporation
4#
5# SPDX-License-Identifier: Apache-2.0
6
7"""
8Dictionary-based Logging Database Generator
9
10This takes the built Zephyr ELF binary and produces a JSON database
11file for dictionary-based logging. This database is used together
12with the parser to decode binary log messages.
13"""
14
15import argparse
16import logging
17import os
18import re
19import string
20import struct
21import sys
22
23import dictionary_parser.log_database
24from dictionary_parser.log_database import LogDatabase
25from dictionary_parser.utils import extract_one_string_in_section
26from dictionary_parser.utils import find_string_in_mappings
27
28import elftools
29from elftools.elf.constants import SH_FLAGS
30from elftools.elf.elffile import ELFFile
31from elftools.elf.descriptions import describe_ei_data
32from elftools.elf.sections import SymbolTableSection
33from elftools.dwarf.descriptions import (
34    describe_DWARF_expr
35)
36from elftools.dwarf.locationlists import (
37    LocationExpr, LocationParser
38)
39
40
41LOGGER_FORMAT = "%(name)s: %(levelname)s: %(message)s"
42logger = logging.getLogger(os.path.basename(sys.argv[0]))
43
44
45# Sections that contains static strings
46STATIC_STRING_SECTIONS = [
47    'rodata',
48    '.rodata',
49    'pinned.rodata',
50]
51
52
53# Regulation expression to match DWARF location
54DT_LOCATION_REGEX = re.compile(r"\(DW_OP_addr: ([0-9a-f]+)")
55
56
57# Format string for pointers (default for 32-bit pointers)
58PTR_FMT = '0x%08x'
59
60
61# Potential string encodings. Add as needed.
62STR_ENCODINGS = [
63    'ascii',
64    'iso-8859-1',
65]
66
67
68# List of acceptable escape character
69ACCEPTABLE_ESCAPE_CHARS = [
70    b'\r',
71    b'\n',
72]
73
74
75def parse_args():
76    """Parse command line arguments"""
77    argparser = argparse.ArgumentParser(allow_abbrev=False)
78
79    argparser.add_argument("elffile", help="Zephyr ELF binary")
80    argparser.add_argument("--build", help="Build ID")
81    argparser.add_argument("--build-header",
82                           help="Header file containing BUILD_VERSION define")
83    argparser.add_argument("--debug", action="store_true",
84                           help="Print extra debugging information")
85    argparser.add_argument("-v", "--verbose", action="store_true",
86                           help="Print more information")
87
88    outfile_grp = argparser.add_mutually_exclusive_group(required=True)
89    outfile_grp.add_argument("--json",
90                             help="Output Dictionary Logging Database file in JSON")
91    outfile_grp.add_argument("--syst",
92                             help="Output MIPI Sys-T Collateral XML file")
93
94    return argparser.parse_args()
95
96
97def extract_elf_code_data_sections(elf):
98    """Find all sections in ELF file"""
99    sections = {}
100
101    for sect in elf.iter_sections():
102        # Only Allocated sections with PROGBITS are included
103        # since they actually have code/data.
104        #
105        # On contrary, BSS is allocated but NOBITS.
106        if (
107            (sect['sh_flags'] & SH_FLAGS.SHF_ALLOC) == SH_FLAGS.SHF_ALLOC
108            and sect['sh_type'] == 'SHT_PROGBITS'
109        ):
110            sections[sect.name] = {
111                    'name'    : sect.name,
112                    'size'    : sect['sh_size'],
113                    'start'   : sect['sh_addr'],
114                    'end'     : sect['sh_addr'] + sect['sh_size'] - 1,
115                    'data'    : sect.data(),
116                }
117
118    return sections
119
120
121def find_elf_sections(elf, sh_name):
122    """Find all sections in ELF file"""
123    for section in elf.iter_sections():
124        if section.name == sh_name:
125            ret = {
126                'name'    : section.name,
127                'size'    : section['sh_size'],
128                'start'   : section['sh_addr'],
129                'end'     : section['sh_addr'] + section['sh_size'] - 1,
130                'data'    : section.data(),
131            }
132
133            return ret
134
135    return None
136
137
138def get_kconfig_symbols(elf):
139    """Get kconfig symbols from the ELF file"""
140    for section in elf.iter_sections():
141        if isinstance(section, SymbolTableSection):
142            return {sym.name: sym.entry.st_value
143                    for sym in section.iter_symbols()
144                       if sym.name.startswith("CONFIG_")}
145
146    raise LookupError("Could not find symbol table")
147
148
149def find_log_const_symbols(elf):
150    """Extract all "log_const_*" symbols from ELF file"""
151    symbol_tables = [s for s in elf.iter_sections()
152                     if isinstance(s, elftools.elf.sections.SymbolTableSection)]
153
154    ret_list = []
155
156    for section in symbol_tables:
157        if not isinstance(section, elftools.elf.sections.SymbolTableSection):
158            continue
159
160        if section['sh_entsize'] == 0:
161            continue
162
163        for symbol in section.iter_symbols():
164            if symbol.name.startswith("log_const_"):
165                ret_list.append(symbol)
166
167    return ret_list
168
169
170def parse_log_const_symbols(database, log_const_area, log_const_symbols, string_mappings):
171    """Find the log instances and map source IDs to names"""
172    if database.is_tgt_little_endian():
173        formatter = "<"
174    else:
175        formatter = ">"
176
177    if database.is_tgt_64bit():
178        # 64-bit pointer to string
179        formatter += "Q"
180    else:
181        # 32-bit pointer to string
182        formatter += "L"
183
184    # log instance level
185    formatter += "B"
186
187    datum_size = struct.calcsize(formatter)
188
189    # Get the address of first log instance
190    first_offset = log_const_symbols[0].entry['st_value']
191    for sym in log_const_symbols:
192        if sym.entry['st_value'] < first_offset:
193            first_offset = sym.entry['st_value']
194
195    first_offset -= log_const_area['start']
196
197    # find all log_const_*
198    for sym in log_const_symbols:
199        # Find data offset in log_const_area for this symbol
200        offset = sym.entry['st_value'] - log_const_area['start']
201
202        idx_s = offset
203        idx_e = offset + datum_size
204
205        datum = log_const_area['data'][idx_s:idx_e]
206
207        if len(datum) != datum_size:
208            # Not enough data to unpack
209            continue
210
211        str_ptr, level = struct.unpack(formatter, datum)
212
213        # Offset to rodata section for string
214        instance_name = find_string_in_mappings(string_mappings, str_ptr)
215        if instance_name is None:
216            instance_name = "unknown"
217
218        logger.info("Found Log Instance: %s, level: %d", instance_name, level)
219
220        # source ID is simply the element index in the log instance array
221        source_id = int((offset - first_offset) / sym.entry['st_size'])
222
223        database.add_log_instance(source_id, instance_name, level, sym.entry['st_value'])
224
225
226def extract_elf_information(elf, database):
227    """Extract information from ELF file and store in database"""
228    e_ident = elf.header['e_ident']
229    elf_data = describe_ei_data(e_ident['EI_DATA'])
230
231    if elf_data == elftools.elf.descriptions._DESCR_EI_DATA['ELFDATA2LSB']:
232        database.set_tgt_endianness(LogDatabase.LITTLE_ENDIAN)
233    elif elf_data == elftools.elf.descriptions._DESCR_EI_DATA['ELFDATA2MSB']:
234        database.set_tgt_endianness(LogDatabase.BIG_ENDIAN)
235    else:
236        logger.error("Cannot determine endianness from ELF file, exiting...")
237        sys.exit(1)
238
239
240def process_kconfigs(elf, database):
241    """Process kconfigs to extract information"""
242    kconfigs = get_kconfig_symbols(elf)
243
244    # 32 or 64-bit target
245    database.set_tgt_bits(64 if "CONFIG_64BIT" in kconfigs else 32)
246
247    # Architecture
248    for name, arch in dictionary_parser.log_database.ARCHS.items():
249        if arch['kconfig'] in kconfigs:
250            database.set_arch(name)
251            break
252
253    # Put some kconfigs into the database
254    #
255    # Use 32-bit timestamp? or 64-bit?
256    if "CONFIG_LOG_TIMESTAMP_64BIT" in kconfigs:
257        database.add_kconfig("CONFIG_LOG_TIMESTAMP_64BIT",
258                             kconfigs['CONFIG_LOG_TIMESTAMP_64BIT'])
259
260
261def extract_logging_subsys_information(elf, database, string_mappings):
262    """
263    Extract logging subsys related information and store in database.
264
265    For example, this extracts the list of log instances to establish
266    mapping from source ID to name.
267    """
268    # Extract log constant section for module names
269    section_log_const = find_elf_sections(elf, "log_const_area")
270    if section_log_const is None:
271        # ESP32 puts "log_const_*" info log_static_section instead of log_const_areas
272        section_log_const = find_elf_sections(elf, "log_static_section")
273
274    if section_log_const is None:
275        logger.error("Cannot find section 'log_const_areas' in ELF file, exiting...")
276        sys.exit(1)
277
278    # Find all "log_const_*" symbols and parse them
279    log_const_symbols = find_log_const_symbols(elf)
280    parse_log_const_symbols(database, section_log_const, log_const_symbols, string_mappings)
281
282
283def is_die_attr_ref(attr):
284    """
285    Returns True if the DIE attribute is a reference.
286    """
287    return bool(attr.form in ('DW_FORM_ref1', 'DW_FORM_ref2',
288                              'DW_FORM_ref4', 'DW_FORM_ref8',
289                              'DW_FORM_ref'))
290
291
292def find_die_var_base_type(compile_unit, die, is_const):
293    """
294    Finds the base type of a DIE and returns the name.
295    If DW_AT_type is a reference, it will recursively go through
296    the references to find the base type. Returns None is no
297    base type is found.
298    """
299    # DIE is of base type. So extract the name.
300    if die.tag == 'DW_TAG_base_type':
301        return die.attributes['DW_AT_name'].value.decode('ascii'), is_const
302
303    # Not a type, cannot continue
304    if not 'DW_AT_type' in die.attributes:
305        return None, None
306
307    if die.tag == 'DW_TAG_const_type':
308        is_const = True
309
310    # DIE is probably a reference to another.
311    # If so, check if the reference is a base type.
312    type_attr = die.attributes['DW_AT_type']
313    if is_die_attr_ref(type_attr):
314        ref_addr = compile_unit.cu_offset + type_attr.raw_value
315        ref_die = compile_unit.get_DIE_from_refaddr(ref_addr)
316
317        return find_die_var_base_type(compile_unit, ref_die, is_const)
318
319    # Not a base type, and not reference
320    return None, None
321
322
323def is_die_var_const_char(compile_unit, die):
324    """
325    Returns True if DIE of type variable is const char.
326    """
327    var_type, is_const = find_die_var_base_type(compile_unit, die, False)
328
329    if var_type is not None and var_type.endswith('char') and is_const:
330        return True
331
332    return False
333
334
335def extract_string_variables(elf):
336    """
337    Find all string variables (char) in all Compilation Units and
338    Debug information Entry (DIE) in ELF file.
339    """
340    dwarf_info = elf.get_dwarf_info()
341    loc_lists = dwarf_info.location_lists()
342    loc_parser = LocationParser(loc_lists)
343
344    strings = []
345
346    # Loop through all Compilation Units and
347    # Debug information Entry (DIE) to extract all string variables
348    for compile_unit in dwarf_info.iter_CUs():
349        for die in compile_unit.iter_DIEs():
350            # Only care about variables with location information
351            # and of type "char"
352            if die.tag == 'DW_TAG_variable':
353                if ('DW_AT_type' in die.attributes
354                    and 'DW_AT_location' in die.attributes
355                    and is_die_var_const_char(compile_unit, die)
356                ):
357                    # Extract location information, which is
358                    # its address in memory.
359                    loc_attr = die.attributes['DW_AT_location']
360                    if loc_parser.attribute_has_location(loc_attr, die.cu['version']):
361                        loc = loc_parser.parse_from_attribute(loc_attr, die.cu['version'])
362                        if isinstance(loc, LocationExpr):
363                            try:
364                                addr = describe_DWARF_expr(loc.loc_expr,
365                                                        dwarf_info.structs)
366
367                                matcher = DT_LOCATION_REGEX.match(addr)
368                                if matcher:
369                                    addr = int(matcher.group(1), 16)
370                                    if addr > 0:
371                                        strings.append({
372                                            'name': die.attributes['DW_AT_name'].value,
373                                            'addr': addr,
374                                            'die': die
375                                        })
376                            except KeyError:
377                                pass
378
379    return strings
380
381
382def try_decode_string(str_maybe):
383    """Check if it is a printable string"""
384    for encoding in STR_ENCODINGS:
385        try:
386            decoded_str = str_maybe.decode(encoding)
387
388            # Check if string is printable according to Python
389            # since the parser (written in Python) will need to
390            # print the string.
391            #
392            # Note that '\r' and '\n' are not included in
393            # string.printable so they need to be checked separately.
394            printable = True
395            for one_char in decoded_str:
396                if (one_char not in string.printable
397                    and one_char not in ACCEPTABLE_ESCAPE_CHARS):
398                    printable = False
399                    break
400
401            if printable:
402                return decoded_str
403        except UnicodeDecodeError:
404            pass
405
406    return None
407
408
409def extract_strings_in_one_section(section, str_mappings):
410    """Extract NULL-terminated strings in one ELF section"""
411    bindata = section['data']
412
413    if len(bindata) < 2:
414        # Can't have a NULL-terminated string with fewer than 2 bytes.
415        return str_mappings
416
417    idx = 0
418
419    # If first byte is not NULL, it may be a string.
420    if bindata[0] == 0:
421        start = None
422    else:
423        start = 0
424
425    while idx < len(bindata):
426        if start is None:
427            if bindata[idx] == 0:
428                # Skip NULL bytes to find next string
429                idx += 1
430            else:
431                # Beginning of possible string
432                start = idx
433                idx += 1
434        else:
435            if bindata[idx] != 0:
436                # Skipping till next NULL byte for possible string
437                idx += 1
438            else:
439                # End of possible string
440                end = idx
441
442                if start != end:
443                    str_maybe = bindata[start:end]
444                    decoded_str = try_decode_string(str_maybe)
445
446                    # Only store readable string
447                    if decoded_str is not None:
448                        addr = section['start'] + start
449
450                        if addr not in str_mappings:
451                            str_mappings[addr] = decoded_str
452
453                            # Decoded string may contain un-printable characters
454                            # (e.g. extended ASC-II characters) or control
455                            # characters (e.g. '\r' or '\n'), so simply print
456                            # the byte string instead.
457                            logger.debug('Found string via extraction at ' + PTR_FMT + ': %s',
458                                         addr, str_maybe)
459
460                            # GCC-based toolchain will reuse the NULL character
461                            # for empty strings. There is no way to know which
462                            # one is being reused, so just treat all NULL character
463                            # at the end of legitimate strings as empty strings.
464                            null_addr = section['start'] + end
465                            str_mappings[null_addr] = ''
466
467                            logger.debug('Found null string via extraction at ' + PTR_FMT,
468                                         null_addr)
469
470                start = None
471                idx += 1
472
473    return str_mappings
474
475
476def extract_static_strings(elf, database, section_extraction=False):
477    """
478    Extract static strings from ELF file using DWARF,
479    and also extraction from binary data.
480    """
481    string_mappings = {}
482
483    elf_sections = extract_elf_code_data_sections(elf)
484
485    # Extract strings using ELF DWARF information
486    str_vars = extract_string_variables(elf)
487    for str_var in str_vars:
488        for _, sect in elf_sections.items():
489            one_str = extract_one_string_in_section(sect, str_var['addr'])
490            if one_str is not None:
491                string_mappings[str_var['addr']] = one_str
492                logger.debug('Found string variable at ' + PTR_FMT + ': %s',
493                             str_var['addr'], one_str)
494                break
495
496    if section_extraction:
497        # Extract strings from ELF sections
498        string_sections = STATIC_STRING_SECTIONS
499        rawstr_map = {}
500
501        # Some architectures may put static strings into additional sections.
502        # So need to extract them too.
503        arch_data = dictionary_parser.log_database.ARCHS[database.get_arch()]
504        if "extra_string_section" in arch_data:
505            string_sections.extend(arch_data['extra_string_section'])
506
507        for sect_name in string_sections:
508            if sect_name in elf_sections:
509                rawstr_map = extract_strings_in_one_section(elf_sections[sect_name],
510                                                                rawstr_map)
511
512        for one_str in rawstr_map:
513            if one_str not in string_mappings:
514                string_mappings[one_str] = rawstr_map[one_str]
515
516    return string_mappings
517
518
519def main():
520    """Main function of database generator"""
521    args = parse_args()
522
523    # Setup logging
524    logging.basicConfig(format=LOGGER_FORMAT, level=logging.WARNING)
525    if args.debug:
526        logger.setLevel(logging.DEBUG)
527    elif args.verbose:
528        logger.setLevel(logging.INFO)
529
530    elffile = open(args.elffile, "rb")
531    if not elffile:
532        logger.error("ERROR: Cannot open ELF file: %s, exiting...", args.elffile)
533        sys.exit(1)
534
535    logger.info("ELF file %s", args.elffile)
536
537    if args.json:
538        logger.info("JSON Database file %s", args.json)
539        section_extraction = True
540
541    if args.syst:
542        logger.info("MIPI Sys-T Collateral file %s", args.syst)
543        section_extraction = False
544
545    elf = ELFFile(elffile)
546
547    database = LogDatabase()
548
549    if args.build_header:
550        with open(args.build_header) as f:
551            for l in f:
552                match = re.match(r'\s*#define\s+BUILD_VERSION\s+(.*)', l)
553                if match:
554                    database.set_build_id(match.group(1))
555                    break
556
557    if args.build:
558        database.set_build_id(args.build)
559        logger.info("Build ID: %s", args.build)
560
561    extract_elf_information(elf, database)
562
563    process_kconfigs(elf, database)
564
565    logger.info("Target: %s, %d-bit", database.get_arch(), database.get_tgt_bits())
566    if database.is_tgt_little_endian():
567        logger.info("Endianness: Little")
568    else:
569        logger.info("Endianness: Big")
570
571    if database.is_tgt_64bit():
572        global PTR_FMT
573        PTR_FMT = '0x%016x'
574
575    # Extract strings from ELF files
576    string_mappings = extract_static_strings(elf, database, section_extraction)
577    if len(string_mappings) > 0:
578        database.set_string_mappings(string_mappings)
579        logger.info("Found %d strings", len(string_mappings))
580
581    # Extract information related to logging subsystem
582    if not section_extraction:
583        # The logging subsys information (e.g. log module names)
584        # may require additional strings outside of those extracted
585        # via ELF DWARF variables. So generate a new string mappings
586        # with strings in various ELF sections.
587        string_mappings = extract_static_strings(elf, database, section_extraction=True)
588
589    extract_logging_subsys_information(elf, database, string_mappings)
590
591    # Write database file
592    if args.json:
593        if not LogDatabase.write_json_database(args.json, database):
594            logger.error("ERROR: Cannot open database file for write: %s, exiting...",
595                         args.json)
596            sys.exit(1)
597
598    if args.syst:
599        if not LogDatabase.write_syst_database(args.syst, database):
600            logger.error("ERROR: Cannot open database file for write: %s, exiting...",
601                         args.syst)
602            sys.exit(1)
603
604    elffile.close()
605
606
607if __name__ == "__main__":
608    main()
609