#!/usr/bin/env python3
#
# Script to aggregate and report Linux perf results.
#
# Example:
# ./scripts/perf.py -R -obench.perf ./runners/bench_runner
# ./scripts/perf.py bench.perf -j -Flfs.c -Flfs_util.c -Scycles
#
# Copyright (c) 2022, The littlefs authors.
# SPDX-License-Identifier: BSD-3-Clause
#

import bisect
import collections as co
import csv
import errno
import fcntl
import functools as ft
import itertools as it
import math as m
import multiprocessing as mp
import os
import re
import shlex
import shutil
import subprocess as sp
import sys
import tempfile
import zipfile

# TODO support non-zip perf results?


PERF_PATH = ['perf']
PERF_EVENTS = 'cycles,branch-misses,branches,cache-misses,cache-references'
PERF_FREQ = 100
OBJDUMP_PATH = ['objdump']
THRESHOLD = (0.5, 0.85)


# integer fields
class Int(co.namedtuple('Int', 'x')):
    __slots__ = ()
    def __new__(cls, x=0):
        if isinstance(x, Int):
            return x
        if isinstance(x, str):
            try:
                x = int(x, 0)
            except ValueError:
                # also accept +-∞ and +-inf
                if re.match(r'^\s*\+?\s*(?:∞|inf)\s*$', x):
                    x = m.inf
                elif re.match(r'^\s*-\s*(?:∞|inf)\s*$', x):
                    x = -m.inf
                else:
                    raise
        assert isinstance(x, int) or m.isinf(x), x
        return super().__new__(cls, x)

    def __str__(self):
        if self.x == m.inf:
            return '∞'
        elif self.x == -m.inf:
            return '-∞'
        else:
            return str(self.x)

    def __int__(self):
        assert not m.isinf(self.x)
        return self.x

    def __float__(self):
        return float(self.x)

    none = '%7s' % '-'
    def table(self):
        return '%7s' % (self,)

    diff_none = '%7s' % '-'
    diff_table = table

    def diff_diff(self, other):
        new = self.x if self else 0
        old = other.x if other else 0
        diff = new - old
        if diff == +m.inf:
            return '%7s' % '+∞'
        elif diff == -m.inf:
            return '%7s' % '-∞'
        else:
            return '%+7d' % diff

    def ratio(self, other):
        new = self.x if self else 0
        old = other.x if other else 0
        if m.isinf(new) and m.isinf(old):
            return 0.0
        elif m.isinf(new):
            return +m.inf
        elif m.isinf(old):
            return -m.inf
        elif not old and not new:
            return 0.0
        elif not old:
            return 1.0
        else:
            return (new-old) / old

    def __add__(self, other):
        return self.__class__(self.x + other.x)

    def __sub__(self, other):
        return self.__class__(self.x - other.x)

    def __mul__(self, other):
        return self.__class__(self.x * other.x)

# perf results
class PerfResult(co.namedtuple('PerfResult', [
        'file', 'function', 'line',
        'cycles', 'bmisses', 'branches', 'cmisses', 'caches',
        'children'])):
    _by = ['file', 'function', 'line']
    _fields = ['cycles', 'bmisses', 'branches', 'cmisses', 'caches']
    _sort = ['cycles', 'bmisses', 'cmisses', 'branches', 'caches']
    _types = {
        'cycles': Int,
        'bmisses': Int, 'branches': Int,
        'cmisses': Int, 'caches': Int}

    __slots__ = ()
    def __new__(cls, file='', function='', line=0,
            cycles=0, bmisses=0, branches=0, cmisses=0, caches=0,
            children=[]):
        return super().__new__(cls, file, function, int(Int(line)),
            Int(cycles), Int(bmisses), Int(branches), Int(cmisses), Int(caches),
            children)

    def __add__(self, other):
        return PerfResult(self.file, self.function, self.line,
            self.cycles + other.cycles,
            self.bmisses + other.bmisses,
            self.branches + other.branches,
            self.cmisses + other.cmisses,
            self.caches + other.caches,
            self.children + other.children)
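
# for quick orientation, a sketch of how these result types compose (the
# values here are invented): adding two results for the same
# file/function/line sums the Int fields and concatenates children, which is
# what fold() below relies on:
#
#   r1 = PerfResult('lfs.c', 'lfs_bd_read', 12, cycles=100)
#   r2 = PerfResult('lfs.c', 'lfs_bd_read', 12, cycles=50)
#   r1 + r2  # -> cycles=150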


def openio(path, mode='r', buffering=-1):
    # allow '-' for stdin/stdout
    if path == '-':
        if mode == 'r':
            return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering)
        else:
            return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering)
    else:
        return open(path, mode, buffering)

# run perf as a subprocess, storing measurements into a zip file
def record(command, *,
        output=None,
        perf_freq=PERF_FREQ,
        perf_period=None,
        perf_events=PERF_EVENTS,
        perf_path=PERF_PATH,
        **args):
    # create a temporary file for perf to write to, as far as I can tell
    # this is strictly needed because perf's pipe-mode only works with stdout
    with tempfile.NamedTemporaryFile('rb') as f:
        # figure out our perf invocation
        perf = perf_path + list(filter(None, [
            'record',
            '-F%s' % perf_freq
                if perf_freq is not None
                and perf_period is None else None,
            '-c%s' % perf_period
                if perf_period is not None else None,
            '-B',
            '-g',
            '--all-user',
            '-e%s' % perf_events,
            '-o%s' % f.name]))

        # run our command
        try:
            if args.get('verbose'):
                print(' '.join(shlex.quote(c) for c in perf + command))
            err = sp.call(perf + command, close_fds=False)

        except KeyboardInterrupt:
            err = errno.EOWNERDEAD

        # synchronize access
        z = os.open(output, os.O_RDWR | os.O_CREAT)
        fcntl.flock(z, fcntl.LOCK_EX)

        # copy measurements into our zip file
        with os.fdopen(z, 'r+b') as z:
            with zipfile.ZipFile(z, 'a',
                    compression=zipfile.ZIP_DEFLATED,
                    compresslevel=1) as z:
                with z.open('perf.%d' % os.getpid(), 'w') as g:
                    shutil.copyfileobj(f, g)

    # forward the return code
    return err


# try to only process each dso once
#
# note this only caches with the non-keyword arguments
def multiprocessing_cache(f):
    local_cache = {}
    manager = mp.Manager()
    global_cache = manager.dict()
    lock = mp.Lock()

    def multiprocessing_cache(*args, **kwargs):
        # check local cache?
        if args in local_cache:
            return local_cache[args]
        # check global cache?
        with lock:
            if args in global_cache:
                v = global_cache[args]
                local_cache[args] = v
                return v
            # fall back to calling the function
            v = f(*args, **kwargs)
            global_cache[args] = v
            local_cache[args] = v
            return v

    return multiprocessing_cache

@multiprocessing_cache
def collect_syms_and_lines(obj_path, *,
        objdump_path=None,
        **args):
    symbol_pattern = re.compile(
        r'^(?P<addr>[0-9a-fA-F]+)'
            r'\s+.*'
            r'\s+(?P<size>[0-9a-fA-F]+)'
            r'\s+(?P<name>[^\s]+)\s*$')
    line_pattern = re.compile(
        r'^\s+(?:'
            # matches dir/file table
            r'(?P<no>[0-9]+)'
                r'(?:\s+(?P<dir>[0-9]+))?'
                r'\s+.*'
                r'\s+(?P<path>[^\s]+)'
            # matches line opcodes
            r'|' r'\[[^\]]*\]\s+'
                r'(?:'
                    r'(?P<op_special>Special)'
                    r'|' r'(?P<op_copy>Copy)'
                    r'|' r'(?P<op_end>End of Sequence)'
                    r'|' r'File .*?to (?:entry )?(?P<op_file>\d+)'
                    r'|' r'Line .*?to (?P<op_line>[0-9]+)'
                    r'|' r'(?:Address|PC) .*?to (?P<op_addr>[0x0-9a-fA-F]+)'
                    r'|' r'.' r')*'
            r')$', re.IGNORECASE)
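
    # for reference, the sort of objdump output these patterns are meant to
    # match (examples only, exact formatting varies with binutils version):
    #
    #   $ objdump -t lfs.o
    #   00000000000011a9 g     F .text  000000000000002e lfs_bd_read
    #
    #   $ objdump --dwarf=rawline lfs.o
    #   1     0     0     0     lfs.c
    #   [0x0000005a]  Special opcode 6: advance Address by 2 to 0x11ab and Line by 1 to 42
    #   [0x00000071]  Extended opcode 1: End of Sequence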

    # figure out symbol addresses and file+line ranges
    syms = {}
    sym_at = []
    cmd = objdump_path + ['-t', obj_path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    proc = sp.Popen(cmd,
        stdout=sp.PIPE,
        stderr=sp.PIPE if not args.get('verbose') else None,
        universal_newlines=True,
        errors='replace',
        close_fds=False)
    for line in proc.stdout:
        m = symbol_pattern.match(line)
        if m:
            name = m.group('name')
            addr = int(m.group('addr'), 16)
            size = int(m.group('size'), 16)
            # ignore zero-sized symbols
            if not size:
                continue
            # note multiple symbols can share a name
            if name not in syms:
                syms[name] = set()
            syms[name].add((addr, size))
            sym_at.append((addr, name, size))
    proc.wait()
    if proc.returncode != 0:
        if not args.get('verbose'):
            for line in proc.stderr:
                sys.stdout.write(line)
        # assume no debug-info on failure
        pass

    # sort and keep largest/first when duplicates
    sym_at.sort(key=lambda x: (x[0], -x[2], x[1]))
    sym_at_ = []
    for addr, name, size in sym_at:
        if len(sym_at_) == 0 or sym_at_[-1][0] != addr:
            sym_at_.append((addr, name, size))
    sym_at = sym_at_

    # state machine for dwarf line numbers, note that objdump's
    # decodedline seems to have issues with multiple dir/file
    # tables, which is why we need this
    lines = []
    line_at = []
    dirs = {}
    files = {}
    op_file = 1
    op_line = 1
    op_addr = 0
    cmd = objdump_path + ['--dwarf=rawline', obj_path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    proc = sp.Popen(cmd,
        stdout=sp.PIPE,
        stderr=sp.PIPE if not args.get('verbose') else None,
        universal_newlines=True,
        errors='replace',
        close_fds=False)
    for line in proc.stdout:
        m = line_pattern.match(line)
        if m:
            if m.group('no') and not m.group('dir'):
                # found a directory entry
                dirs[int(m.group('no'))] = m.group('path')
            elif m.group('no'):
                # found a file entry
                dir = int(m.group('dir'))
                if dir in dirs:
                    files[int(m.group('no'))] = os.path.join(
                        dirs[dir],
                        m.group('path'))
                else:
                    files[int(m.group('no'))] = m.group('path')
            else:
                # found a state machine update
                if m.group('op_file'):
                    op_file = int(m.group('op_file'), 0)
                if m.group('op_line'):
                    op_line = int(m.group('op_line'), 0)
                if m.group('op_addr'):
                    op_addr = int(m.group('op_addr'), 0)

                if (m.group('op_special')
                        or m.group('op_copy')
                        or m.group('op_end')):
                    file = os.path.abspath(files.get(op_file, '?'))
                    lines.append((file, op_line, op_addr))
                    line_at.append((op_addr, file, op_line))

                if m.group('op_end'):
                    op_file = 1
                    op_line = 1
                    op_addr = 0
    proc.wait()
    if proc.returncode != 0:
        if not args.get('verbose'):
            for line in proc.stderr:
                sys.stdout.write(line)
        # assume no debug-info on failure
        pass

    # sort and keep first when duplicates
    lines.sort()
    lines_ = []
    for file, line, addr in lines:
        if len(lines_) == 0 or lines_[-1][0] != file or lines_[-1][1] != line:
            lines_.append((file, line, addr))
    lines = lines_

    # sort and keep first when duplicates
    line_at.sort()
    line_at_ = []
    for addr, file, line in line_at:
        if len(line_at_) == 0 or line_at_[-1][0] != addr:
            line_at_.append((addr, file, line))
    line_at = line_at_

    return syms, sym_at, lines, line_at
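
# a rough sketch of the shapes returned by collect_syms_and_lines, with
# invented values:
#
#   syms    = {'lfs_bd_read': {(0x11a9, 0x2e)}}   # name -> {(addr, size)}
#   sym_at  = [(0x11a9, 'lfs_bd_read', 0x2e)]     # sorted by addr
#   lines   = [('/src/lfs.c', 42, 0x11ab)]        # sorted by file+line
#   line_at = [(0x11ab, '/src/lfs.c', 42)]        # sorted by addr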

def collect_decompressed(path, *,
        perf_path=PERF_PATH,
        sources=None,
        everything=False,
        propagate=0,
        depth=1,
        **args):
    sample_pattern = re.compile(
        r'(?P<comm>\w+)'
        r'\s+(?P<pid>\w+)'
        r'\s+(?P<time>[\w.]+):'
        r'\s*(?P<period>\w+)'
        r'\s+(?P<event>[^:]+):')
    frame_pattern = re.compile(
        r'\s+(?P<addr>\w+)'
        r'\s+(?P<sym>[^\s\+]+)(?:\+(?P<off>\w+))?'
        r'\s+\((?P<dso>[^\)]+)\)')
    events = {
        'cycles':           'cycles',
        'branch-misses':    'bmisses',
        'branches':         'branches',
        'cache-misses':     'cmisses',
        'cache-references': 'caches'}

    # note perf_path may contain extra args
    cmd = perf_path + [
        'script',
        '-i%s' % path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    proc = sp.Popen(cmd,
        stdout=sp.PIPE,
        stderr=sp.PIPE if not args.get('verbose') else None,
        universal_newlines=True,
        errors='replace',
        close_fds=False)

    last_filtered = False
    last_event = ''
    last_period = 0
    last_stack = []
    deltas = co.defaultdict(lambda: {})
    syms_ = co.defaultdict(lambda: {})
    at_cache = {}
    results = {}

    def commit():
        # tail-recursively propagate measurements
        for i in range(len(last_stack)):
            results_ = results
            for j in reversed(range(i+1)):
                if i+1-j > depth:
                    break

                # propagate
                name = last_stack[j]
                if name not in results_:
                    results_[name] = (co.defaultdict(lambda: 0), {})
                results_[name][0][last_event] += last_period

                # recurse
                results_ = results_[name][1]

    for line in proc.stdout:
        # we need to process a lot of data, so wait to use regex as late
        # as possible
        if not line.startswith('\t'):
            if last_filtered:
                commit()
            last_filtered = False

            if line:
                m = sample_pattern.match(line)
                if m and m.group('event') in events:
                    last_filtered = True
                    last_event = m.group('event')
                    last_period = int(m.group('period'), 0)
                    last_stack = []

        elif last_filtered:
            m = frame_pattern.match(line)
            if m:
                # filter out internal/kernel functions
                if not everything and (
                        m.group('sym').startswith('__')
                        or m.group('sym').startswith('0')
                        or m.group('sym').startswith('-')
                        or m.group('sym').startswith('[')
                        or m.group('dso').startswith('/usr/lib')):
                    continue

                dso = m.group('dso')
                sym = m.group('sym')
                off = int(m.group('off'), 0) if m.group('off') else 0
                addr_ = int(m.group('addr'), 16)

                # get the syms/lines for the dso, this is cached
                syms, sym_at, lines, line_at = collect_syms_and_lines(
                    dso,
                    **args)

                # ASLR is tricky, we have symbols+offsets, but static
                # symbols mean we may have multiple options for each symbol.
                #
                # To try to solve this, we use previously seen symbols to
                # build confidence for the correct ASLR delta. This means we
                # may guess incorrectly for early symbols, but this will only
                # affect a few samples.
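                #
                # As a made-up example: if static symbol "foo" was linked at
                # both 0x1000 and 0x2000, a sample at runtime address
                # 0x555000010 with off=0x10 yields candidate deltas
                # 0x1000-0x555000000 and 0x2000-0x555000000; whichever
                # candidate also maps the other symbols we've seen closest
                # to their link-time addresses accumulates the least error
                # and wins the min() below.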
                if sym in syms:
                    sym_addr_ = addr_ - off

                    # track possible deltas?
                    for sym_addr, size in syms[sym]:
                        delta = sym_addr - sym_addr_
                        if delta not in deltas[dso]:
                            deltas[dso][delta] = sum(
                                abs(a_+delta - a)
                                for s, (a_, _) in syms_[dso].items()
                                for a, _ in syms[s])
                        for delta in deltas[dso].keys():
                            deltas[dso][delta] += abs(sym_addr_+delta - sym_addr)
                    syms_[dso][sym] = sym_addr_, size

                    # guess the best delta
                    delta, _ = min(deltas[dso].items(),
                        key=lambda x: (x[1], x[0]))
                    addr = addr_ + delta

                    # cached?
                    if (dso,addr) in at_cache:
                        cached = at_cache[(dso,addr)]
                        if cached is None:
                            # cache says to skip
                            continue
                        file, line = cached
                    else:
                        # find file+line
                        i = bisect.bisect(line_at, addr, key=lambda x: x[0])
                        if i > 0:
                            _, file, line = line_at[i-1]
                        else:
                            file, line = re.sub(r'(\.o)?$', '.c', dso, count=1), 0

                        # ignore filtered sources
                        if sources is not None:
                            if not any(
                                    os.path.abspath(file) == os.path.abspath(s)
                                    for s in sources):
                                at_cache[(dso,addr)] = None
                                continue
                        else:
                            # default to only cwd
                            if not everything and not os.path.commonpath([
                                    os.getcwd(),
                                    os.path.abspath(file)]) == os.getcwd():
                                at_cache[(dso,addr)] = None
                                continue

                        # simplify path
                        if os.path.commonpath([
                                os.getcwd(),
                                os.path.abspath(file)]) == os.getcwd():
                            file = os.path.relpath(file)
                        else:
                            file = os.path.abspath(file)

                        at_cache[(dso,addr)] = file, line
                else:
                    file, line = re.sub(r'(\.o)?$', '.c', dso, count=1), 0

                last_stack.append((file, sym, line))

                # stop propagating?
                if propagate and len(last_stack) >= propagate:
                    commit()
                    last_filtered = False
    if last_filtered:
        commit()

    proc.wait()
    if proc.returncode != 0:
        if not args.get('verbose'):
            for line in proc.stderr:
                sys.stdout.write(line)
        sys.exit(-1)

    # rearrange results into result type
    def to_results(results):
        results_ = []
        for name, (r, children) in results.items():
            results_.append(PerfResult(*name,
                **{events[k]: v for k, v in r.items()},
                children=to_results(children)))
        return results_

    return to_results(results)

def collect_job(path, i, **args):
    # decompress into a temporary file, this is to work around
    # some limitations of perf
    with zipfile.ZipFile(path) as z:
        with z.open(i) as f:
            with tempfile.NamedTemporaryFile('wb') as g:
                shutil.copyfileobj(f, g)
                g.flush()

                return collect_decompressed(g.name, **args)

def starapply(args):
    f, args, kwargs = args
    return f(*args, **kwargs)
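
# note Pool.imap_unordered passes workers a single argument, which is why
# starapply exists and collect below bundles everything into one
# (f, args, kwargs) tuple, roughly:
#
#   p.imap_unordered(starapply, [(collect_job, (path, i), args)])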

def collect(perf_paths, *,
        jobs=None,
        **args):
    # automatic job detection?
    if jobs == 0:
        jobs = len(os.sched_getaffinity(0))

    records = []
    for path in perf_paths:
        # each .perf file is actually a zip file containing perf files from
        # multiple runs
        with zipfile.ZipFile(path) as z:
            records.extend((path, i) for i in z.infolist())

    # we're dealing with a lot of data but also surprisingly
    # parallelizable
    if jobs is not None:
        results = []
        with mp.Pool(jobs) as p:
            for results_ in p.imap_unordered(
                    starapply,
                    ((collect_job, (path, i), args) for path, i in records)):
                results.extend(results_)
    else:
        results = []
        for path, i in records:
            results.extend(collect_job(path, i, **args))

    return results


def fold(Result, results, *,
        by=None,
        defines=None,
        **_):
    if by is None:
        by = Result._by

    for k in it.chain(by or [], (k for k, _ in defines or [])):
        if k not in Result._by and k not in Result._fields:
            print("error: could not find field %r?" % k)
            sys.exit(-1)

    # filter by matching defines
    if defines is not None:
        results_ = []
        for r in results:
            if all(getattr(r, k) in vs for k, vs in defines):
                results_.append(r)
        results = results_

    # organize results into conflicts
    folding = co.OrderedDict()
    for r in results:
        name = tuple(getattr(r, k) for k in by)
        if name not in folding:
            folding[name] = []
        folding[name].append(r)

    # merge conflicts
    folded = []
    for name, rs in folding.items():
        folded.append(sum(rs[1:], start=rs[0]))

    # fold recursively
    folded_ = []
    for r in folded:
        folded_.append(r._replace(children=fold(
            Result, r.children,
            by=by,
            defines=defines)))
    folded = folded_

    return folded
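
# a hedged example of what fold does, with invented numbers: folding two
# results for ('lfs.c', 'lfs_bd_read', 12) with by=['function'] collapses
# them into a single ('lfs_bd_read',) row whose fields are summed, then the
# same fold is applied recursively to each row's children.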

def table(Result, results, diff_results=None, *,
        by=None,
        fields=None,
        sort=None,
        summary=False,
        all=False,
        percent=False,
        depth=1,
        **_):
    all_, all = all, __builtins__.all

    if by is None:
        by = Result._by
    if fields is None:
        fields = Result._fields
    types = Result._types

    # fold again
    results = fold(Result, results, by=by)
    if diff_results is not None:
        diff_results = fold(Result, diff_results, by=by)

    # organize by name
    table = {
        ','.join(str(getattr(r, k) or '') for k in by): r
        for r in results}
    diff_table = {
        ','.join(str(getattr(r, k) or '') for k in by): r
        for r in diff_results or []}
    names = list(table.keys() | diff_table.keys())

    # sort again, now with diff info, note that python's sort is stable
    names.sort()
    if diff_results is not None:
        names.sort(key=lambda n: tuple(
            types[k].ratio(
                getattr(table.get(n), k, None),
                getattr(diff_table.get(n), k, None))
            for k in fields),
            reverse=True)
    if sort:
        for k, reverse in reversed(sort):
            names.sort(
                key=lambda n: tuple(
                    (getattr(table[n], k),)
                    if getattr(table.get(n), k, None) is not None else ()
                    for k in ([k] if k else [
                        k for k in Result._sort if k in fields])),
                reverse=reverse ^ (not k or k in Result._fields))


    # build up our lines
    lines = []

    # header
    header = []
    header.append('%s%s' % (
        ','.join(by),
        ' (%d added, %d removed)' % (
            sum(1 for n in table if n not in diff_table),
            sum(1 for n in diff_table if n not in table))
        if diff_results is not None and not percent else '')
        if not summary else '')
    if diff_results is None:
        for k in fields:
            header.append(k)
    elif percent:
        for k in fields:
            header.append(k)
    else:
        for k in fields:
            header.append('o'+k)
        for k in fields:
            header.append('n'+k)
        for k in fields:
            header.append('d'+k)
    header.append('')
    lines.append(header)

    def table_entry(name, r, diff_r=None, ratios=[]):
        entry = []
        entry.append(name)
        if diff_results is None:
            for k in fields:
                entry.append(getattr(r, k).table()
                    if getattr(r, k, None) is not None
                    else types[k].none)
        elif percent:
            for k in fields:
                entry.append(getattr(r, k).diff_table()
                    if getattr(r, k, None) is not None
                    else types[k].diff_none)
        else:
            for k in fields:
                entry.append(getattr(diff_r, k).diff_table()
                    if getattr(diff_r, k, None) is not None
                    else types[k].diff_none)
            for k in fields:
                entry.append(getattr(r, k).diff_table()
                    if getattr(r, k, None) is not None
                    else types[k].diff_none)
            for k in fields:
                entry.append(types[k].diff_diff(
                    getattr(r, k, None),
                    getattr(diff_r, k, None)))
        if diff_results is None:
            entry.append('')
        elif percent:
            entry.append(' (%s)' % ', '.join(
                '+∞%' if t == +m.inf
                else '-∞%' if t == -m.inf
                else '%+.1f%%' % (100*t)
                for t in ratios))
        else:
            entry.append(' (%s)' % ', '.join(
                    '+∞%' if t == +m.inf
                    else '-∞%' if t == -m.inf
                    else '%+.1f%%' % (100*t)
                    for t in ratios
                    if t)
                if any(ratios) else '')
        return entry

    # entries
    if not summary:
        for name in names:
            r = table.get(name)
            if diff_results is None:
                diff_r = None
                ratios = None
            else:
                diff_r = diff_table.get(name)
                ratios = [
                    types[k].ratio(
                        getattr(r, k, None),
                        getattr(diff_r, k, None))
                    for k in fields]
                if not all_ and not any(ratios):
                    continue
            lines.append(table_entry(name, r, diff_r, ratios))

    # total
    r = next(iter(fold(Result, results, by=[])), None)
    if diff_results is None:
        diff_r = None
        ratios = None
    else:
        diff_r = next(iter(fold(Result, diff_results, by=[])), None)
        ratios = [
            types[k].ratio(
                getattr(r, k, None),
                getattr(diff_r, k, None))
            for k in fields]
    lines.append(table_entry('TOTAL', r, diff_r, ratios))

    # find the best widths, note that column 0 contains the names and column -1
    # the ratios, so those are handled a bit differently
    widths = [
        ((max(it.chain([w], (len(l[i]) for l in lines)))+1+4-1)//4)*4-1
        for w, i in zip(
            it.chain([23], it.repeat(7)),
            range(len(lines[0])-1))]

    # adjust the name width based on the expected call depth, though
    # note this doesn't really work with unbounded recursion
    if not summary and not m.isinf(depth):
        widths[0] += 4*(depth-1)

    # print the tree recursively
    print('%-*s %s%s' % (
        widths[0], lines[0][0],
        ' '.join('%*s' % (w, x)
            for w, x in zip(widths[1:], lines[0][1:-1])),
        lines[0][-1]))
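
    # when -Z/--depth > 1, entries are rendered as a tree; the output looks
    # roughly like this (illustrative only):
    #
    #   function          cycles
    #   lfs_bd_read          420
    #   |-> lfs_bd_cmp       120
    #   '-> lfs_bd_flush      80
    #   TOTAL                420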

    if not summary:
        def recurse(results_, depth_, prefixes=('', '', '', '')):
            # rebuild our tables at each layer
            table_ = {
                ','.join(str(getattr(r, k) or '') for k in by): r
                for r in results_}
            names_ = list(table_.keys())

            # sort again at each layer, keep in mind the numbers are
            # changing as we descend
            names_.sort()
            if sort:
                for k, reverse in reversed(sort):
                    names_.sort(
                        key=lambda n: tuple(
                            (getattr(table_[n], k),)
                            if getattr(table_.get(n), k, None) is not None
                            else ()
                            for k in ([k] if k else [
                                k for k in Result._sort if k in fields])),
                        reverse=reverse ^ (not k or k in Result._fields))

            for i, name in enumerate(names_):
                r = table_[name]
                is_last = (i == len(names_)-1)

                print('%s%-*s %s' % (
                    prefixes[0+is_last],
                    widths[0] - (
                        len(prefixes[0+is_last])
                        if not m.isinf(depth) else 0),
                    name,
                    ' '.join('%*s' % (w, x)
                        for w, x in zip(
                            widths[1:],
                            table_entry(name, r)[1:]))))

                # recurse?
                if depth_ > 1:
                    recurse(
                        r.children,
                        depth_-1,
                        (prefixes[2+is_last] + "|-> ",
                         prefixes[2+is_last] + "'-> ",
                         prefixes[2+is_last] + "|   ",
                         prefixes[2+is_last] + "    "))

        # we have enough going on with diffing to make the top layer
        # a special case
        for name, line in zip(names, lines[1:-1]):
            print('%-*s %s%s' % (
                widths[0], line[0],
                ' '.join('%*s' % (w, x)
                    for w, x in zip(widths[1:], line[1:-1])),
                line[-1]))

            if name in table and depth > 1:
                recurse(
                    table[name].children,
                    depth-1,
                    ("|-> ",
                     "'-> ",
                     "|   ",
                     "    "))

    print('%-*s %s%s' % (
        widths[0], lines[-1][0],
        ' '.join('%*s' % (w, x)
            for w, x in zip(widths[1:], lines[-1][1:-1])),
        lines[-1][-1]))


def annotate(Result, results, *,
        annotate=None,
        threshold=None,
        branches=False,
        caches=False,
        **args):
    # figure out the threshold
    if threshold is None:
        t0, t1 = THRESHOLD
    elif len(threshold) == 1:
        t0, t1 = threshold[0], threshold[0]
    else:
        t0, t1 = threshold
    t0, t1 = min(t0, t1), max(t0, t1)

    if not branches and not caches:
        tk = 'cycles'
    elif branches:
        tk = 'bmisses'
    else:
        tk = 'cmisses'

    # find max cycles
    max_ = max(it.chain((float(getattr(r, tk)) for r in results), [1]))

    for path in co.OrderedDict.fromkeys(r.file for r in results).keys():
        # flatten to line info
        results = fold(Result, results, by=['file', 'line'])
        table = {r.line: r for r in results if r.file == path}

        # calculate spans to show
        if not annotate:
            spans = []
            last = None
            func = None
            for line, r in sorted(table.items()):
                if float(getattr(r, tk)) / max_ >= t0:
                    if last is not None and line - last.stop <= args['context']:
                        last = range(
                            last.start,
                            line+1+args['context'])
                    else:
                        if last is not None:
                            spans.append((last, func))
                        last = range(
                            line-args['context'],
                            line+1+args['context'])
                        func = r.function
            if last is not None:
                spans.append((last, func))
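
        # the annotation printed below looks roughly like this (the sample
        # is made up), with hot lines colored when --color is enabled:
        #
        #   @@ lfs.c:42: lfs_bd_read @@
        #   static int lfs_bd_read(lfs_t *lfs, ...             // 1234 cycles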

        with open(path) as f:
            skipped = False
            for i, line in enumerate(f):
                # skip lines not in spans?
                if not annotate and not any(i+1 in s for s, _ in spans):
                    skipped = True
                    continue

                if skipped:
                    skipped = False
                    print('%s@@ %s:%d: %s @@%s' % (
                        '\x1b[36m' if args['color'] else '',
                        path,
                        i+1,
                        next(iter(f for _, f in spans)),
                        '\x1b[m' if args['color'] else ''))

                # build line
                if line.endswith('\n'):
                    line = line[:-1]

                r = table.get(i+1)
                if r is not None and (
                        float(r.cycles) > 0
                        if not branches and not caches
                        else float(r.bmisses) > 0 or float(r.branches) > 0
                        if branches
                        else float(r.cmisses) > 0 or float(r.caches) > 0):
                    line = '%-*s // %s' % (
                        args['width'],
                        line,
                        '%s cycles' % r.cycles
                        if not branches and not caches
                        else '%s bmisses, %s branches' % (r.bmisses, r.branches)
                        if branches
                        else '%s cmisses, %s caches' % (r.cmisses, r.caches))

                    if args['color']:
                        if float(getattr(r, tk)) / max_ >= t1:
                            line = '\x1b[1;31m%s\x1b[m' % line
                        elif float(getattr(r, tk)) / max_ >= t0:
                            line = '\x1b[35m%s\x1b[m' % line

                print(line)


def report(perf_paths, *,
        by=None,
        fields=None,
        defines=None,
        sort=None,
        branches=False,
        caches=False,
        **args):
    # figure out what color should be
    if args.get('color') == 'auto':
        args['color'] = sys.stdout.isatty()
    elif args.get('color') == 'always':
        args['color'] = True
    else:
        args['color'] = False

    # depth of 0 == m.inf
    if args.get('depth') == 0:
        args['depth'] = m.inf

    # find sizes
    if not args.get('use', None):
        results = collect(perf_paths, **args)
    else:
        results = []
        with openio(args['use']) as f:
            reader = csv.DictReader(f, restval='')
            for r in reader:
                if not any('perf_'+k in r and r['perf_'+k].strip()
                        for k in PerfResult._fields):
                    continue
                try:
                    results.append(PerfResult(
                        **{k: r[k] for k in PerfResult._by
                            if k in r and r[k].strip()},
                        **{k: r['perf_'+k] for k in PerfResult._fields
                            if 'perf_'+k in r and r['perf_'+k].strip()}))
                except TypeError:
                    pass

    # fold
    results = fold(PerfResult, results, by=by, defines=defines)

    # sort, note that python's sort is stable
    results.sort()
    if sort:
        for k, reverse in reversed(sort):
            results.sort(
                key=lambda r: tuple(
                    (getattr(r, k),) if getattr(r, k) is not None else ()
                    for k in ([k] if k else PerfResult._sort)),
                reverse=reverse ^ (not k or k in PerfResult._fields))

    # write results to CSV
    if args.get('output'):
        with openio(args['output'], 'w') as f:
            writer = csv.DictWriter(f,
                (by if by is not None else PerfResult._by)
                + ['perf_'+k for k in (
                    fields if fields is not None else PerfResult._fields)])
            writer.writeheader()
            for r in results:
                writer.writerow(
                    {k: getattr(r, k) for k in (
                        by if by is not None else PerfResult._by)}
                    | {'perf_'+k: getattr(r, k) for k in (
                        fields if fields is not None else PerfResult._fields)})
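
    # with -bfunction, the CSV written above is shaped roughly like this
    # (values invented):
    #
    #   function,perf_cycles,perf_bmisses,perf_branches,perf_cmisses,perf_caches
    #   lfs_bd_read,1234,5,678,9,90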

    # find previous results?
    if args.get('diff'):
        diff_results = []
        try:
            with openio(args['diff']) as f:
                reader = csv.DictReader(f, restval='')
                for r in reader:
                    if not any('perf_'+k in r and r['perf_'+k].strip()
                            for k in PerfResult._fields):
                        continue
                    try:
                        diff_results.append(PerfResult(
                            **{k: r[k] for k in PerfResult._by
                                if k in r and r[k].strip()},
                            **{k: r['perf_'+k] for k in PerfResult._fields
                                if 'perf_'+k in r and r['perf_'+k].strip()}))
                    except TypeError:
                        pass
        except FileNotFoundError:
            pass

        # fold
        diff_results = fold(PerfResult, diff_results, by=by, defines=defines)

    # print table
    if not args.get('quiet'):
        if args.get('annotate') or args.get('threshold'):
            # annotate sources
            annotate(PerfResult, results,
                branches=branches,
                caches=caches,
                **args)
        else:
            # print table
            table(PerfResult, results,
                diff_results if args.get('diff') else None,
                by=by if by is not None else ['function'],
                fields=fields if fields is not None
                    else ['cycles'] if not branches and not caches
                    else ['bmisses', 'branches'] if branches
                    else ['cmisses', 'caches'],
                sort=sort,
                **args)


def main(**args):
    if args.get('record'):
        return record(**args)
    else:
        return report(**args)


if __name__ == "__main__":
    import argparse

    # bit of a hack, but parse_intermixed_args and REMAINDER are
    # incompatible, so we need to figure out what we want before running
    # argparse
    if '-R' in sys.argv or '--record' in sys.argv:
        nargs = argparse.REMAINDER
    else:
        nargs = '*'

    argparse.ArgumentParser._handle_conflict_ignore = lambda *_: None
    argparse._ArgumentGroup._handle_conflict_ignore = lambda *_: None
    parser = argparse.ArgumentParser(
        description="Aggregate and report Linux perf results.",
        allow_abbrev=False,
        conflict_handler='ignore')
    parser.add_argument(
        'perf_paths',
        nargs=nargs,
        help="Input *.perf files.")
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Output commands that run behind the scenes.")
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help="Don't show anything, useful with -o.")
    parser.add_argument(
        '-o', '--output',
        help="Specify CSV file to store results.")
    parser.add_argument(
        '-u', '--use',
        help="Don't parse anything, use this CSV file.")
    parser.add_argument(
        '-d', '--diff',
        help="Specify CSV file to diff against.")
    parser.add_argument(
        '-a', '--all',
        action='store_true',
        help="Show all, not just the ones that changed.")
    parser.add_argument(
        '-p', '--percent',
        action='store_true',
        help="Only show percentage change, not a full diff.")
    parser.add_argument(
        '-b', '--by',
        action='append',
        choices=PerfResult._by,
        help="Group by this field.")
    parser.add_argument(
        '-f', '--field',
        dest='fields',
        action='append',
        choices=PerfResult._fields,
        help="Show this field.")
    parser.add_argument(
        '-D', '--define',
        dest='defines',
        action='append',
        type=lambda x: (lambda k, v: (k, set(v.split(','))))(*x.split('=', 1)),
        help="Only include results where this field is this value.")
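    # custom action: -s/-S accumulate (field, reverse) pairs in the order
    # given, so the first flag becomes the primary sort key, e.g.
    # -Scycles -sbmisses sorts primarily by cycles, backwards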
    class AppendSort(argparse.Action):
        def __call__(self, parser, namespace, value, option):
            if namespace.sort is None:
                namespace.sort = []
            namespace.sort.append((value, True if option == '-S' else False))
    parser.add_argument(
        '-s', '--sort',
        nargs='?',
        action=AppendSort,
        help="Sort by this field.")
    parser.add_argument(
        '-S', '--reverse-sort',
        nargs='?',
        action=AppendSort,
        help="Sort by this field, but backwards.")
    parser.add_argument(
        '-Y', '--summary',
        action='store_true',
        help="Only show the total.")
    parser.add_argument(
        '-F', '--source',
        dest='sources',
        action='append',
        help="Only consider definitions in this file. Defaults to anything "
            "in the current directory.")
    parser.add_argument(
        '--everything',
        action='store_true',
        help="Include builtin and libc specific symbols.")
    parser.add_argument(
        '--branches',
        action='store_true',
        help="Show branches and branch misses.")
    parser.add_argument(
        '--caches',
        action='store_true',
        help="Show cache accesses and cache misses.")
    parser.add_argument(
        '-P', '--propagate',
        type=lambda x: int(x, 0),
        help="Depth to propagate samples up the call-stack. 0 propagates up "
            "to the entry point, 1 does no propagation. Defaults to 0.")
    parser.add_argument(
        '-Z', '--depth',
        nargs='?',
        type=lambda x: int(x, 0),
        const=0,
        help="Depth of function calls to show. 0 shows all calls but may not "
            "terminate!")
    parser.add_argument(
        '-A', '--annotate',
        action='store_true',
        help="Show source files annotated with coverage info.")
    parser.add_argument(
        '-T', '--threshold',
        nargs='?',
        type=lambda x: tuple(float(x) for x in x.split(',')),
        const=THRESHOLD,
        help="Show lines with samples above this threshold as a percent of "
            "all lines. Defaults to %s." % ','.join(str(t) for t in THRESHOLD))
    parser.add_argument(
        '-c', '--context',
        type=lambda x: int(x, 0),
        default=3,
        help="Show n additional lines of context. Defaults to 3.")
    parser.add_argument(
        '-W', '--width',
        type=lambda x: int(x, 0),
        default=80,
        help="Assume source is styled with this many columns. Defaults to 80.")
    parser.add_argument(
        '--color',
        choices=['never', 'always', 'auto'],
        default='auto',
        help="When to use terminal colors. Defaults to 'auto'.")
    parser.add_argument(
        '-j', '--jobs',
        nargs='?',
        type=lambda x: int(x, 0),
        const=0,
        help="Number of processes to use. 0 spawns one process per core.")
    parser.add_argument(
        '--perf-path',
        type=lambda x: x.split(),
        help="Path to the perf executable, may include flags. "
            "Defaults to %r." % PERF_PATH)
    parser.add_argument(
        '--objdump-path',
        type=lambda x: x.split(),
        default=OBJDUMP_PATH,
        help="Path to the objdump executable, may include flags. "
            "Defaults to %r." % OBJDUMP_PATH)

    # record flags
    record_parser = parser.add_argument_group('record options')
    record_parser.add_argument(
        'command',
        nargs=nargs,
        help="Command to run.")
    record_parser.add_argument(
        '-R', '--record',
        action='store_true',
        help="Run a command and aggregate perf measurements.")
    record_parser.add_argument(
        '-o', '--output',
        help="Output file. Uses flock to synchronize. This is stored as a "
            "zip-file of multiple perf results.")
    record_parser.add_argument(
        '--perf-freq',
        help="perf sampling frequency. This is passed directly to perf. "
            "Defaults to %r." % PERF_FREQ)
    record_parser.add_argument(
        '--perf-period',
        help="perf sampling period. This is passed directly to perf.")
    record_parser.add_argument(
        '--perf-events',
        help="perf events to record. This is passed directly to perf. "
            "Defaults to %r." % PERF_EVENTS)
    record_parser.add_argument(
        '--perf-path',
        type=lambda x: x.split(),
        help="Path to the perf executable, may include flags. "
            "Defaults to %r." % PERF_PATH)

    # avoid intermixed/REMAINDER conflict, see above
    if nargs == argparse.REMAINDER:
        args = parser.parse_args()
    else:
        args = parser.parse_intermixed_args()

    # perf_paths/command overlap, so need to do some munging here
    args.command = args.perf_paths
    if args.record:
        if not args.command:
            print('error: no command specified?')
            sys.exit(-1)
        if not args.output:
            print('error: no output file specified?')
            sys.exit(-1)

    sys.exit(main(**{k: v
        for k, v in vars(args).items()
        if v is not None}))