1#!/usr/bin/env python3
2#
3# Script to summarize the outputs of other scripts. Operates on CSV files.
4#
5# Example:
6# ./scripts/code.py lfs.o lfs_util.o -q -o lfs.code.csv
7# ./scripts/data.py lfs.o lfs_util.o -q -o lfs.data.csv
8# ./scripts/summary.py lfs.code.csv lfs.data.csv -q -o lfs.csv
9# ./scripts/summary.py -Y lfs.csv -f code=code_size,data=data_size
10#
11# Copyright (c) 2022, The littlefs authors.
12# SPDX-License-Identifier: BSD-3-Clause
13#
14
15import collections as co
16import csv
17import functools as ft
18import itertools as it
19import math as m
20import os
21import re
22
23
24# supported merge operations
25#
26# this is a terrible way to express these
27#
28OPS = {
29    'sum':     lambda xs: sum(xs[1:], start=xs[0]),
30    'prod':    lambda xs: m.prod(xs[1:], start=xs[0]),
31    'min':     min,
32    'max':     max,
33    'mean':    lambda xs: Float(sum(float(x) for x in xs) / len(xs)),
34    'stddev':  lambda xs: (
35        lambda mean: Float(
36            m.sqrt(sum((float(x) - mean)**2 for x in xs) / len(xs)))
37        )(sum(float(x) for x in xs) / len(xs)),
38    'gmean':   lambda xs: Float(m.prod(float(x) for x in xs)**(1/len(xs))),
39    'gstddev': lambda xs: (
40        lambda gmean: Float(
41            m.exp(m.sqrt(sum(m.log(float(x)/gmean)**2 for x in xs) / len(xs)))
42            if gmean else m.inf)
43        )(m.prod(float(x) for x in xs)**(1/len(xs))),
44}
45
46
47# integer fields
48class Int(co.namedtuple('Int', 'x')):
49    __slots__ = ()
50    def __new__(cls, x=0):
51        if isinstance(x, Int):
52            return x
53        if isinstance(x, str):
54            try:
55                x = int(x, 0)
56            except ValueError:
57                # also accept +-∞ and +-inf
58                if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
59                    x = m.inf
60                elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
61                    x = -m.inf
62                else:
63                    raise
64        assert isinstance(x, int) or m.isinf(x), x
65        return super().__new__(cls, x)
66
67    def __str__(self):
68        if self.x == m.inf:
69            return '∞'
70        elif self.x == -m.inf:
71            return '-∞'
72        else:
73            return str(self.x)
74
75    def __int__(self):
76        assert not m.isinf(self.x)
77        return self.x
78
79    def __float__(self):
80        return float(self.x)
81
82    none = '%7s' % '-'
83    def table(self):
84        return '%7s' % (self,)
85
86    diff_none = '%7s' % '-'
87    diff_table = table
88
89    def diff_diff(self, other):
90        new = self.x if self else 0
91        old = other.x if other else 0
92        diff = new - old
93        if diff == +m.inf:
94            return '%7s' % '+∞'
95        elif diff == -m.inf:
96            return '%7s' % '-∞'
97        else:
98            return '%+7d' % diff
99
100    def ratio(self, other):
101        new = self.x if self else 0
102        old = other.x if other else 0
103        if m.isinf(new) and m.isinf(old):
104            return 0.0
105        elif m.isinf(new):
106            return +m.inf
107        elif m.isinf(old):
108            return -m.inf
109        elif not old and not new:
110            return 0.0
111        elif not old:
112            return 1.0
113        else:
114            return (new-old) / old
115
116    def __add__(self, other):
117        return self.__class__(self.x + other.x)
118
119    def __sub__(self, other):
120        return self.__class__(self.x - other.x)
121
122    def __mul__(self, other):
123        return self.__class__(self.x * other.x)
124
125# float fields
126class Float(co.namedtuple('Float', 'x')):
127    __slots__ = ()
128    def __new__(cls, x=0.0):
129        if isinstance(x, Float):
130            return x
131        if isinstance(x, str):
132            try:
133                x = float(x)
134            except ValueError:
135                # also accept +-∞ and +-inf
136                if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
137                    x = m.inf
138                elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
139                    x = -m.inf
140                else:
141                    raise
142        assert isinstance(x, float), x
143        return super().__new__(cls, x)
144
145    def __str__(self):
146        if self.x == m.inf:
147            return '∞'
148        elif self.x == -m.inf:
149            return '-∞'
150        else:
151            return '%.1f' % self.x
152
153    def __float__(self):
154        return float(self.x)
155
156    none = Int.none
157    table = Int.table
158    diff_none = Int.diff_none
159    diff_table = Int.diff_table
160    diff_diff = Int.diff_diff
161    ratio = Int.ratio
162    __add__ = Int.__add__
163    __sub__ = Int.__sub__
164    __mul__ = Int.__mul__
165
166# fractional fields, a/b
167class Frac(co.namedtuple('Frac', 'a,b')):
168    __slots__ = ()
169    def __new__(cls, a=0, b=None):
170        if isinstance(a, Frac) and b is None:
171            return a
172        if isinstance(a, str) and b is None:
173            a, b = a.split('/', 1)
174        if b is None:
175            b = a
176        return super().__new__(cls, Int(a), Int(b))
177
178    def __str__(self):
179        return '%s/%s' % (self.a, self.b)
180
181    def __float__(self):
182        return float(self.a)
183
184    none = '%11s %7s' % ('-', '-')
185    def table(self):
186        t = self.a.x/self.b.x if self.b.x else 1.0
187        return '%11s %7s' % (
188            self,
189            '∞%' if t == +m.inf
190            else '-∞%' if t == -m.inf
191            else '%.1f%%' % (100*t))
192
193    diff_none = '%11s' % '-'
194    def diff_table(self):
195        return '%11s' % (self,)
196
197    def diff_diff(self, other):
198        new_a, new_b = self if self else (Int(0), Int(0))
199        old_a, old_b = other if other else (Int(0), Int(0))
200        return '%11s' % ('%s/%s' % (
201            new_a.diff_diff(old_a).strip(),
202            new_b.diff_diff(old_b).strip()))
203
204    def ratio(self, other):
205        new_a, new_b = self if self else (Int(0), Int(0))
206        old_a, old_b = other if other else (Int(0), Int(0))
207        new = new_a.x/new_b.x if new_b.x else 1.0
208        old = old_a.x/old_b.x if old_b.x else 1.0
209        return new - old
210
211    def __add__(self, other):
212        return self.__class__(self.a + other.a, self.b + other.b)
213
214    def __sub__(self, other):
215        return self.__class__(self.a - other.a, self.b - other.b)
216
217    def __mul__(self, other):
218        return self.__class__(self.a * other.a, self.b + other.b)
219
220    def __lt__(self, other):
221        self_t = self.a.x/self.b.x if self.b.x else 1.0
222        other_t = other.a.x/other.b.x if other.b.x else 1.0
223        return (self_t, self.a.x) < (other_t, other.a.x)
224
225    def __gt__(self, other):
226        return self.__class__.__lt__(other, self)
227
228    def __le__(self, other):
229        return not self.__gt__(other)
230
231    def __ge__(self, other):
232        return not self.__lt__(other)
233
234# available types
235TYPES = co.OrderedDict([
236    ('int',   Int),
237    ('float', Float),
238    ('frac',  Frac)
239])
240
241
242def infer(results, *,
243        by=None,
244        fields=None,
245        types={},
246        ops={},
247        renames=[],
248        **_):
249    # if fields not specified, try to guess from data
250    if fields is None:
251        fields = co.OrderedDict()
252        for r in results:
253            for k, v in r.items():
254                if (by is None or k not in by) and v.strip():
255                    types_ = []
256                    for t in fields.get(k, TYPES.values()):
257                        try:
258                            t(v)
259                            types_.append(t)
260                        except ValueError:
261                            pass
262                    fields[k] = types_
263        fields = list(k for k, v in fields.items() if v)
264
265    # deduplicate fields
266    fields = list(co.OrderedDict.fromkeys(fields).keys())
267
268    # if by not specified, guess it's anything not in fields and not a
269    # source of a rename
270    if by is None:
271        by = co.OrderedDict()
272        for r in results:
273            # also ignore None keys, these are introduced by csv.DictReader
274            # when header + row mismatch
275            by.update((k, True) for k in r.keys()
276                if k is not None
277                    and k not in fields
278                    and not any(k == old_k for _, old_k in renames))
279        by = list(by.keys())
280
281    # deduplicate fields
282    by = list(co.OrderedDict.fromkeys(by).keys())
283
284    # find best type for all fields
285    types_ = {}
286    for k in fields:
287        if k in types:
288            types_[k] = types[k]
289        else:
290            for t in TYPES.values():
291                for r in results:
292                    if k in r and r[k].strip():
293                        try:
294                            t(r[k])
295                        except ValueError:
296                            break
297                else:
298                    types_[k] = t
299                    break
300            else:
301                print("error: no type matches field %r?" % k)
302                sys.exit(-1)
303    types = types_
304
305    # does folding change the type?
306    types_ = {}
307    for k, t in types.items():
308        types_[k] = ops.get(k, OPS['sum'])([t()]).__class__
309
310
311    # create result class
312    def __new__(cls, **r):
313        return cls.__mro__[1].__new__(cls,
314            **{k: r.get(k, '') for k in by},
315            **{k: r[k] if k in r and isinstance(r[k], list)
316                else [types[k](r[k])] if k in r
317                else []
318                for k in fields})
319
320    def __add__(self, other):
321        return self.__class__(
322            **{k: getattr(self, k) for k in by},
323            **{k: object.__getattribute__(self, k)
324                + object.__getattribute__(other, k)
325                for k in fields})
326
327    def __getattribute__(self, k):
328        if k in fields:
329            if object.__getattribute__(self, k):
330                return ops.get(k, OPS['sum'])(object.__getattribute__(self, k))
331            else:
332                return None
333        return object.__getattribute__(self, k)
334
335    return type('Result', (co.namedtuple('Result', by + fields),), {
336        '__slots__': (),
337        '__new__': __new__,
338        '__add__': __add__,
339        '__getattribute__': __getattribute__,
340        '_by': by,
341        '_fields': fields,
342        '_sort': fields,
343        '_types': types_,
344    })
345
346
347def fold(Result, results, *,
348        by=None,
349        defines=None,
350        **_):
351    if by is None:
352        by = Result._by
353
354    for k in it.chain(by or [], (k for k, _ in defines or [])):
355        if k not in Result._by and k not in Result._fields:
356            print("error: could not find field %r?" % k)
357            sys.exit(-1)
358
359    # filter by matching defines
360    if defines is not None:
361        results_ = []
362        for r in results:
363            if all(getattr(r, k) in vs for k, vs in defines):
364                results_.append(r)
365        results = results_
366
367    # organize results into conflicts
368    folding = co.OrderedDict()
369    for r in results:
370        name = tuple(getattr(r, k) for k in by)
371        if name not in folding:
372            folding[name] = []
373        folding[name].append(r)
374
375    # merge conflicts
376    folded = []
377    for name, rs in folding.items():
378        folded.append(sum(rs[1:], start=rs[0]))
379
380    return folded
381
382def table(Result, results, diff_results=None, *,
383        by=None,
384        fields=None,
385        sort=None,
386        summary=False,
387        all=False,
388        percent=False,
389        **_):
390    all_, all = all, __builtins__.all
391
392    if by is None:
393        by = Result._by
394    if fields is None:
395        fields = Result._fields
396    types = Result._types
397
398    # fold again
399    results = fold(Result, results, by=by)
400    if diff_results is not None:
401        diff_results = fold(Result, diff_results, by=by)
402
403    # organize by name
404    table = {
405        ','.join(str(getattr(r, k) or '') for k in by): r
406        for r in results}
407    diff_table = {
408        ','.join(str(getattr(r, k) or '') for k in by): r
409        for r in diff_results or []}
410    names = list(table.keys() | diff_table.keys())
411
412    # sort again, now with diff info, note that python's sort is stable
413    names.sort()
414    if diff_results is not None:
415        names.sort(key=lambda n: tuple(
416            types[k].ratio(
417                getattr(table.get(n), k, None),
418                getattr(diff_table.get(n), k, None))
419            for k in fields),
420            reverse=True)
421    if sort:
422        for k, reverse in reversed(sort):
423            names.sort(
424                key=lambda n: tuple(
425                    (getattr(table[n], k),)
426                    if getattr(table.get(n), k, None) is not None else ()
427                    for k in ([k] if k else [
428                        k for k in Result._sort if k in fields])),
429                reverse=reverse ^ (not k or k in Result._fields))
430
431
432    # build up our lines
433    lines = []
434
435    # header
436    header = []
437    header.append('%s%s' % (
438        ','.join(by),
439        ' (%d added, %d removed)' % (
440            sum(1 for n in table if n not in diff_table),
441            sum(1 for n in diff_table if n not in table))
442            if diff_results is not None and not percent else '')
443        if not summary else '')
444    if diff_results is None:
445        for k in fields:
446            header.append(k)
447    elif percent:
448        for k in fields:
449            header.append(k)
450    else:
451        for k in fields:
452            header.append('o'+k)
453        for k in fields:
454            header.append('n'+k)
455        for k in fields:
456            header.append('d'+k)
457    header.append('')
458    lines.append(header)
459
460    def table_entry(name, r, diff_r=None, ratios=[]):
461        entry = []
462        entry.append(name)
463        if diff_results is None:
464            for k in fields:
465                entry.append(getattr(r, k).table()
466                    if getattr(r, k, None) is not None
467                    else types[k].none)
468        elif percent:
469            for k in fields:
470                entry.append(getattr(r, k).diff_table()
471                    if getattr(r, k, None) is not None
472                    else types[k].diff_none)
473        else:
474            for k in fields:
475                entry.append(getattr(diff_r, k).diff_table()
476                    if getattr(diff_r, k, None) is not None
477                    else types[k].diff_none)
478            for k in fields:
479                entry.append(getattr(r, k).diff_table()
480                    if getattr(r, k, None) is not None
481                    else types[k].diff_none)
482            for k in fields:
483                entry.append(types[k].diff_diff(
484                        getattr(r, k, None),
485                        getattr(diff_r, k, None)))
486        if diff_results is None:
487            entry.append('')
488        elif percent:
489            entry.append(' (%s)' % ', '.join(
490                '+∞%' if t == +m.inf
491                else '-∞%' if t == -m.inf
492                else '%+.1f%%' % (100*t)
493                for t in ratios))
494        else:
495            entry.append(' (%s)' % ', '.join(
496                    '+∞%' if t == +m.inf
497                    else '-∞%' if t == -m.inf
498                    else '%+.1f%%' % (100*t)
499                    for t in ratios
500                    if t)
501                if any(ratios) else '')
502        return entry
503
504    # entries
505    if not summary:
506        for name in names:
507            r = table.get(name)
508            if diff_results is None:
509                diff_r = None
510                ratios = None
511            else:
512                diff_r = diff_table.get(name)
513                ratios = [
514                    types[k].ratio(
515                        getattr(r, k, None),
516                        getattr(diff_r, k, None))
517                    for k in fields]
518                if not all_ and not any(ratios):
519                    continue
520            lines.append(table_entry(name, r, diff_r, ratios))
521
522    # total
523    r = next(iter(fold(Result, results, by=[])), None)
524    if diff_results is None:
525        diff_r = None
526        ratios = None
527    else:
528        diff_r = next(iter(fold(Result, diff_results, by=[])), None)
529        ratios = [
530            types[k].ratio(
531                getattr(r, k, None),
532                getattr(diff_r, k, None))
533            for k in fields]
534    lines.append(table_entry('TOTAL', r, diff_r, ratios))
535
536    # find the best widths, note that column 0 contains the names and column -1
537    # the ratios, so those are handled a bit differently
538    widths = [
539        ((max(it.chain([w], (len(l[i]) for l in lines)))+1+4-1)//4)*4-1
540        for w, i in zip(
541            it.chain([23], it.repeat(7)),
542            range(len(lines[0])-1))]
543
544    # print our table
545    for line in lines:
546        print('%-*s  %s%s' % (
547            widths[0], line[0],
548            ' '.join('%*s' % (w, x)
549                for w, x in zip(widths[1:], line[1:-1])),
550            line[-1]))
551
552
553def openio(path, mode='r', buffering=-1):
554    # allow '-' for stdin/stdout
555    if path == '-':
556        if mode == 'r':
557            return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering)
558        else:
559            return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering)
560    else:
561        return open(path, mode, buffering)
562
563def main(csv_paths, *,
564        by=None,
565        fields=None,
566        defines=None,
567        sort=None,
568        **args):
569    # separate out renames
570    renames = list(it.chain.from_iterable(
571        ((k, v) for v in vs)
572        for k, vs in it.chain(by or [], fields or [])))
573    if by is not None:
574        by = [k for k, _ in by]
575    if fields is not None:
576        fields = [k for k, _ in fields]
577
578    # figure out types
579    types = {}
580    for t in TYPES.keys():
581        for k in args.get(t, []):
582            if k in types:
583                print("error: conflicting type for field %r?" % k)
584                sys.exit(-1)
585            types[k] = TYPES[t]
586    # rename types?
587    if renames:
588        types_ = {}
589        for new_k, old_k in renames:
590            if old_k in types:
591                types_[new_k] = types[old_k]
592        types.update(types_)
593
594    # figure out merge operations
595    ops = {}
596    for o in OPS.keys():
597        for k in args.get(o, []):
598            if k in ops:
599                print("error: conflicting op for field %r?" % k)
600                sys.exit(-1)
601            ops[k] = OPS[o]
602    # rename ops?
603    if renames:
604        ops_ = {}
605        for new_k, old_k in renames:
606            if old_k in ops:
607                ops_[new_k] = ops[old_k]
608        ops.update(ops_)
609
610    # find CSV files
611    results = []
612    for path in csv_paths:
613        try:
614            with openio(path) as f:
615                reader = csv.DictReader(f, restval='')
616                for r in reader:
617                    # rename fields?
618                    if renames:
619                        # make a copy so renames can overlap
620                        r_ = {}
621                        for new_k, old_k in renames:
622                            if old_k in r:
623                                r_[new_k] = r[old_k]
624                        r.update(r_)
625
626                    results.append(r)
627        except FileNotFoundError:
628            pass
629
630    # homogenize
631    Result = infer(results,
632        by=by,
633        fields=fields,
634        types=types,
635        ops=ops,
636        renames=renames)
637    results_ = []
638    for r in results:
639        if not any(k in r and r[k].strip()
640                for k in Result._fields):
641            continue
642        try:
643            results_.append(Result(**{
644                k: r[k] for k in Result._by + Result._fields
645                if k in r and r[k].strip()}))
646        except TypeError:
647            pass
648    results = results_
649
650    # fold
651    results = fold(Result, results, by=by, defines=defines)
652
653    # sort, note that python's sort is stable
654    results.sort()
655    if sort:
656        for k, reverse in reversed(sort):
657            results.sort(
658                key=lambda r: tuple(
659                    (getattr(r, k),) if getattr(r, k) is not None else ()
660                    for k in ([k] if k else Result._sort)),
661                reverse=reverse ^ (not k or k in Result._fields))
662
663    # write results to CSV
664    if args.get('output'):
665        with openio(args['output'], 'w') as f:
666            writer = csv.DictWriter(f, Result._by + Result._fields)
667            writer.writeheader()
668            for r in results:
669                # note we need to go through getattr to resolve lazy fields
670                writer.writerow({
671                    k: getattr(r, k) for k in Result._by + Result._fields})
672
673    # find previous results?
674    if args.get('diff'):
675        diff_results = []
676        try:
677            with openio(args['diff']) as f:
678                reader = csv.DictReader(f, restval='')
679                for r in reader:
680                    # rename fields?
681                    if renames:
682                        # make a copy so renames can overlap
683                        r_ = {}
684                        for new_k, old_k in renames:
685                            if old_k in r:
686                                r_[new_k] = r[old_k]
687                        r.update(r_)
688
689                    if not any(k in r and r[k].strip()
690                            for k in Result._fields):
691                        continue
692                    try:
693                        diff_results.append(Result(**{
694                            k: r[k] for k in Result._by + Result._fields
695                            if k in r and r[k].strip()}))
696                    except TypeError:
697                        pass
698        except FileNotFoundError:
699            pass
700
701        # fold
702        diff_results = fold(Result, diff_results, by=by, defines=defines)
703
704    # print table
705    if not args.get('quiet'):
706        table(Result, results,
707            diff_results if args.get('diff') else None,
708            by=by,
709            fields=fields,
710            sort=sort,
711            **args)
712
713
714if __name__ == "__main__":
715    import argparse
716    import sys
717    parser = argparse.ArgumentParser(
718        description="Summarize measurements in CSV files.",
719        allow_abbrev=False)
720    parser.add_argument(
721        'csv_paths',
722        nargs='*',
723        help="Input *.csv files.")
724    parser.add_argument(
725        '-q', '--quiet',
726        action='store_true',
727        help="Don't show anything, useful with -o.")
728    parser.add_argument(
729        '-o', '--output',
730        help="Specify CSV file to store results.")
731    parser.add_argument(
732        '-d', '--diff',
733        help="Specify CSV file to diff against.")
734    parser.add_argument(
735        '-a', '--all',
736        action='store_true',
737        help="Show all, not just the ones that changed.")
738    parser.add_argument(
739        '-p', '--percent',
740        action='store_true',
741        help="Only show percentage change, not a full diff.")
742    parser.add_argument(
743        '-b', '--by',
744        action='append',
745        type=lambda x: (
746            lambda k,v=None: (k, v.split(',') if v is not None else ())
747            )(*x.split('=', 1)),
748        help="Group by this field. Can rename fields with new_name=old_name.")
749    parser.add_argument(
750        '-f', '--field',
751        dest='fields',
752        action='append',
753        type=lambda x: (
754            lambda k,v=None: (k, v.split(',') if v is not None else ())
755            )(*x.split('=', 1)),
756        help="Show this field. Can rename fields with new_name=old_name.")
757    parser.add_argument(
758        '-D', '--define',
759        dest='defines',
760        action='append',
761        type=lambda x: (lambda k,v: (k, set(v.split(','))))(*x.split('=', 1)),
762        help="Only include results where this field is this value. May include "
763            "comma-separated options.")
764    class AppendSort(argparse.Action):
765        def __call__(self, parser, namespace, value, option):
766            if namespace.sort is None:
767                namespace.sort = []
768            namespace.sort.append((value, True if option == '-S' else False))
769    parser.add_argument(
770        '-s', '--sort',
771        nargs='?',
772        action=AppendSort,
773        help="Sort by this field.")
774    parser.add_argument(
775        '-S', '--reverse-sort',
776        nargs='?',
777        action=AppendSort,
778        help="Sort by this field, but backwards.")
779    parser.add_argument(
780        '-Y', '--summary',
781        action='store_true',
782        help="Only show the total.")
783    parser.add_argument(
784        '--int',
785        action='append',
786        help="Treat these fields as ints.")
787    parser.add_argument(
788        '--float',
789        action='append',
790        help="Treat these fields as floats.")
791    parser.add_argument(
792        '--frac',
793        action='append',
794        help="Treat these fields as fractions.")
795    parser.add_argument(
796        '--sum',
797        action='append',
798        help="Add these fields (the default).")
799    parser.add_argument(
800        '--prod',
801        action='append',
802        help="Multiply these fields.")
803    parser.add_argument(
804        '--min',
805        action='append',
806        help="Take the minimum of these fields.")
807    parser.add_argument(
808        '--max',
809        action='append',
810        help="Take the maximum of these fields.")
811    parser.add_argument(
812        '--mean',
813        action='append',
814        help="Average these fields.")
815    parser.add_argument(
816        '--stddev',
817        action='append',
818        help="Find the standard deviation of these fields.")
819    parser.add_argument(
820        '--gmean',
821        action='append',
822        help="Find the geometric mean of these fields.")
823    parser.add_argument(
824        '--gstddev',
825        action='append',
826        help="Find the geometric standard deviation of these fields.")
827    sys.exit(main(**{k: v
828        for k, v in vars(parser.parse_intermixed_args()).items()
829        if v is not None}))
830