1#!/usr/bin/env python3
2
3"""Assemble Mbed TLS change log entries into the change log file.
4
5Add changelog entries to the first level-2 section.
6Create a new level-2 section for unreleased changes if needed.
7Remove the input files unless --keep-entries is specified.
8
9In each level-3 section, entries are sorted in chronological order
10(oldest first). From oldest to newest:
11* Merged entry files are sorted according to their merge date (date of
12  the merge commit that brought the commit that created the file into
13  the target branch).
14* Committed but unmerged entry files are sorted according to the date
15  of the commit that adds them.
16* Uncommitted entry files are sorted according to their modification time.
17
18You must run this program from within a git working directory.
19"""
20
21# Copyright The Mbed TLS Contributors
22# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
23
24import argparse
25from collections import OrderedDict, namedtuple
26import datetime
27import functools
28import glob
29import os
30import re
31import subprocess
32import sys
33
34class InputFormatError(Exception):
35    def __init__(self, filename, line_number, message, *args, **kwargs):
36        message = '{}:{}: {}'.format(filename, line_number,
37                                     message.format(*args, **kwargs))
38        super().__init__(message)
39
40class CategoryParseError(Exception):
41    def __init__(self, line_offset, error_message):
42        self.line_offset = line_offset
43        self.error_message = error_message
44        super().__init__('{}: {}'.format(line_offset, error_message))
45
46class LostContent(Exception):
47    def __init__(self, filename, line):
48        message = ('Lost content from {}: "{}"'.format(filename, line))
49        super().__init__(message)
50
51# The category names we use in the changelog.
52# If you edit this, update ChangeLog.d/README.md.
53STANDARD_CATEGORIES = (
54    'API changes',
55    'Default behavior changes',
56    'Requirement changes',
57    'New deprecations',
58    'Removals',
59    'Features',
60    'Security',
61    'Bugfix',
62    'Changes',
63)
64
65# The maximum line length for an entry
66MAX_LINE_LENGTH = 80
67
68CategoryContent = namedtuple('CategoryContent', [
69    'name', 'title_line', # Title text and line number of the title
70    'body', 'body_line', # Body text and starting line number of the body
71])
72
73class ChangelogFormat:
74    """Virtual class documenting how to write a changelog format class."""
75
76    @classmethod
77    def extract_top_version(cls, changelog_file_content):
78        """Split out the top version section.
79
80        If the top version is already released, create a new top
81        version section for an unreleased version.
82
83        Return ``(header, top_version_title, top_version_body, trailer)``
84        where the "top version" is the existing top version section if it's
85        for unreleased changes, and a newly created section otherwise.
86        To assemble the changelog after modifying top_version_body,
87        concatenate the four pieces.
88        """
89        raise NotImplementedError
90
91    @classmethod
92    def version_title_text(cls, version_title):
93        """Return the text of a formatted version section title."""
94        raise NotImplementedError
95
96    @classmethod
97    def split_categories(cls, version_body):
98        """Split a changelog version section body into categories.
99
100        Return a list of `CategoryContent` the name is category title
101        without any formatting.
102        """
103        raise NotImplementedError
104
105    @classmethod
106    def format_category(cls, title, body):
107        """Construct the text of a category section from its title and body."""
108        raise NotImplementedError
109
110class TextChangelogFormat(ChangelogFormat):
111    """The traditional Mbed TLS changelog format."""
112
113    _unreleased_version_text = '= Mbed TLS x.x.x branch released xxxx-xx-xx'
114    @classmethod
115    def is_released_version(cls, title):
116        # Look for an incomplete release date
117        return not re.search(r'[0-9x]{4}-[0-9x]{2}-[0-9x]?x', title)
118
119    _top_version_re = re.compile(r'(?:\A|\n)(=[^\n]*\n+)(.*?\n)(?:=|$)',
120                                 re.DOTALL)
121    @classmethod
122    def extract_top_version(cls, changelog_file_content):
123        """A version section starts with a line starting with '='."""
124        m = re.search(cls._top_version_re, changelog_file_content)
125        top_version_start = m.start(1)
126        top_version_end = m.end(2)
127        top_version_title = m.group(1)
128        top_version_body = m.group(2)
129        if cls.is_released_version(top_version_title):
130            top_version_end = top_version_start
131            top_version_title = cls._unreleased_version_text + '\n\n'
132            top_version_body = ''
133        return (changelog_file_content[:top_version_start],
134                top_version_title, top_version_body,
135                changelog_file_content[top_version_end:])
136
137    @classmethod
138    def version_title_text(cls, version_title):
139        return re.sub(r'\n.*', version_title, re.DOTALL)
140
141    _category_title_re = re.compile(r'(^\w.*)\n+', re.MULTILINE)
142    @classmethod
143    def split_categories(cls, version_body):
144        """A category title is a line with the title in column 0."""
145        if not version_body:
146            return []
147        title_matches = list(re.finditer(cls._category_title_re, version_body))
148        if not title_matches or title_matches[0].start() != 0:
149            # There is junk before the first category.
150            raise CategoryParseError(0, 'Junk found where category expected')
151        title_starts = [m.start(1) for m in title_matches]
152        body_starts = [m.end(0) for m in title_matches]
153        body_ends = title_starts[1:] + [len(version_body)]
154        bodies = [version_body[body_start:body_end].rstrip('\n') + '\n'
155                  for (body_start, body_end) in zip(body_starts, body_ends)]
156        title_lines = [version_body[:pos].count('\n') for pos in title_starts]
157        body_lines = [version_body[:pos].count('\n') for pos in body_starts]
158        return [CategoryContent(title_match.group(1), title_line,
159                                body, body_line)
160                for title_match, title_line, body, body_line
161                in zip(title_matches, title_lines, bodies, body_lines)]
162
163    @classmethod
164    def format_category(cls, title, body):
165        # `split_categories` ensures that each body ends with a newline.
166        # Make sure that there is additionally a blank line between categories.
167        if not body.endswith('\n\n'):
168            body += '\n'
169        return title + '\n' + body
170
171class ChangeLog:
172    """An Mbed TLS changelog.
173
174    A changelog file consists of some header text followed by one or
175    more version sections. The version sections are in reverse
176    chronological order. Each version section consists of a title and a body.
177
178    The body of a version section consists of zero or more category
179    subsections. Each category subsection consists of a title and a body.
180
181    A changelog entry file has the same format as the body of a version section.
182
183    A `ChangelogFormat` object defines the concrete syntax of the changelog.
184    Entry files must have the same format as the changelog file.
185    """
186
187    # Only accept dotted version numbers (e.g. "3.1", not "3").
188    # Refuse ".x" in a version number where x is a letter: this indicates
189    # a version that is not yet released. Something like "3.1a" is accepted.
190    _version_number_re = re.compile(r'[0-9]+\.[0-9A-Za-z.]+')
191    _incomplete_version_number_re = re.compile(r'.*\.[A-Za-z]')
192    _only_url_re = re.compile(r'^\s*\w+://\S+\s*$')
193    _has_url_re = re.compile(r'.*://.*')
194
195    def add_categories_from_text(self, filename, line_offset,
196                                 text, allow_unknown_category):
197        """Parse a version section or entry file."""
198        try:
199            categories = self.format.split_categories(text)
200        except CategoryParseError as e:
201            raise InputFormatError(filename, line_offset + e.line_offset,
202                                   e.error_message)
203        for category in categories:
204            if not allow_unknown_category and \
205               category.name not in self.categories:
206                raise InputFormatError(filename,
207                                       line_offset + category.title_line,
208                                       'Unknown category: "{}"',
209                                       category.name)
210
211            body_split = category.body.splitlines()
212
213            for line_number, line in enumerate(body_split, 1):
214                if not self._only_url_re.match(line) and \
215                   len(line) > MAX_LINE_LENGTH:
216                    long_url_msg = '. URL exceeding length limit must be alone in its line.' \
217                        if self._has_url_re.match(line) else ""
218                    raise InputFormatError(filename,
219                                           category.body_line + line_number,
220                                           'Line is longer than allowed: '
221                                           'Length {} (Max {}){}',
222                                           len(line), MAX_LINE_LENGTH,
223                                           long_url_msg)
224
225            self.categories[category.name] += category.body
226
227    def __init__(self, input_stream, changelog_format):
228        """Create a changelog object.
229
230        Populate the changelog object from the content of the file
231        input_stream.
232        """
233        self.format = changelog_format
234        whole_file = input_stream.read()
235        (self.header,
236         self.top_version_title, top_version_body,
237         self.trailer) = self.format.extract_top_version(whole_file)
238        # Split the top version section into categories.
239        self.categories = OrderedDict()
240        for category in STANDARD_CATEGORIES:
241            self.categories[category] = ''
242        offset = (self.header + self.top_version_title).count('\n') + 1
243        self.add_categories_from_text(input_stream.name, offset,
244                                      top_version_body, True)
245
246    def add_file(self, input_stream):
247        """Add changelog entries from a file.
248        """
249        self.add_categories_from_text(input_stream.name, 1,
250                                      input_stream.read(), False)
251
252    def write(self, filename):
253        """Write the changelog to the specified file.
254        """
255        with open(filename, 'w', encoding='utf-8') as out:
256            out.write(self.header)
257            out.write(self.top_version_title)
258            for title, body in self.categories.items():
259                if not body:
260                    continue
261                out.write(self.format.format_category(title, body))
262            out.write(self.trailer)
263
264
265@functools.total_ordering
266class EntryFileSortKey:
267    """This classes defines an ordering on changelog entry files: older < newer.
268
269    * Merged entry files are sorted according to their merge date (date of
270      the merge commit that brought the commit that created the file into
271      the target branch).
272    * Committed but unmerged entry files are sorted according to the date
273      of the commit that adds them.
274    * Uncommitted entry files are sorted according to their modification time.
275
276    This class assumes that the file is in a git working directory with
277    the target branch checked out.
278    """
279
280    # Categories of files. A lower number is considered older.
281    MERGED = 0
282    COMMITTED = 1
283    LOCAL = 2
284
285    @staticmethod
286    def creation_hash(filename):
287        """Return the git commit id at which the given file was created.
288
289        Return None if the file was never checked into git.
290        """
291        hashes = subprocess.check_output(['git', 'log', '--format=%H',
292                                          '--follow',
293                                          '--', filename])
294        m = re.search('(.+)$', hashes.decode('ascii'))
295        if not m:
296            # The git output is empty. This means that the file was
297            # never checked in.
298            return None
299        # The last commit in the log is the oldest one, which is when the
300        # file was created.
301        return m.group(0)
302
303    @staticmethod
304    def list_merges(some_hash, target, *options):
305        """List merge commits from some_hash to target.
306
307        Pass options to git to select which commits are included.
308        """
309        text = subprocess.check_output(['git', 'rev-list',
310                                        '--merges', *options,
311                                        '..'.join([some_hash, target])])
312        return text.decode('ascii').rstrip('\n').split('\n')
313
314    @classmethod
315    def merge_hash(cls, some_hash):
316        """Return the git commit id at which the given commit was merged.
317
318        Return None if the given commit was never merged.
319        """
320        target = 'HEAD'
321        # List the merges from some_hash to the target in two ways.
322        # The ancestry list is the ones that are both descendants of
323        # some_hash and ancestors of the target.
324        ancestry = frozenset(cls.list_merges(some_hash, target,
325                                             '--ancestry-path'))
326        # The first_parents list only contains merges that are directly
327        # on the target branch. We want it in reverse order (oldest first).
328        first_parents = cls.list_merges(some_hash, target,
329                                        '--first-parent', '--reverse')
330        # Look for the oldest merge commit that's both on the direct path
331        # and directly on the target branch. That's the place where some_hash
332        # was merged on the target branch. See
333        # https://stackoverflow.com/questions/8475448/find-merge-commit-which-include-a-specific-commit
334        for commit in first_parents:
335            if commit in ancestry:
336                return commit
337        return None
338
339    @staticmethod
340    def commit_timestamp(commit_id):
341        """Return the timestamp of the given commit."""
342        text = subprocess.check_output(['git', 'show', '-s',
343                                        '--format=%ct',
344                                        commit_id])
345        return datetime.datetime.utcfromtimestamp(int(text))
346
347    @staticmethod
348    def file_timestamp(filename):
349        """Return the modification timestamp of the given file."""
350        mtime = os.stat(filename).st_mtime
351        return datetime.datetime.fromtimestamp(mtime)
352
353    def __init__(self, filename):
354        """Determine position of the file in the changelog entry order.
355
356        This constructor returns an object that can be used with comparison
357        operators, with `sort` and `sorted`, etc. Older entries are sorted
358        before newer entries.
359        """
360        self.filename = filename
361        creation_hash = self.creation_hash(filename)
362        if not creation_hash:
363            self.category = self.LOCAL
364            self.datetime = self.file_timestamp(filename)
365            return
366        merge_hash = self.merge_hash(creation_hash)
367        if not merge_hash:
368            self.category = self.COMMITTED
369            self.datetime = self.commit_timestamp(creation_hash)
370            return
371        self.category = self.MERGED
372        self.datetime = self.commit_timestamp(merge_hash)
373
374    def sort_key(self):
375        """"Return a concrete sort key for this entry file sort key object.
376
377        ``ts1 < ts2`` is implemented as ``ts1.sort_key() < ts2.sort_key()``.
378        """
379        return (self.category, self.datetime, self.filename)
380
381    def __eq__(self, other):
382        return self.sort_key() == other.sort_key()
383
384    def __lt__(self, other):
385        return self.sort_key() < other.sort_key()
386
387
388def check_output(generated_output_file, main_input_file, merged_files):
389    """Make sanity checks on the generated output.
390
391    The intent of these sanity checks is to have reasonable confidence
392    that no content has been lost.
393
394    The sanity check is that every line that is present in an input file
395    is also present in an output file. This is not perfect but good enough
396    for now.
397    """
398    with open(generated_output_file, 'r', encoding='utf-8') as fd:
399        generated_output = set(fd)
400        for line in open(main_input_file, 'r', encoding='utf-8'):
401            if line not in generated_output:
402                raise LostContent('original file', line)
403        for merged_file in merged_files:
404            for line in open(merged_file, 'r', encoding='utf-8'):
405                if line not in generated_output:
406                    raise LostContent(merged_file, line)
407
408def finish_output(changelog, output_file, input_file, merged_files):
409    """Write the changelog to the output file.
410
411    The input file and the list of merged files are used only for sanity
412    checks on the output.
413    """
414    if os.path.exists(output_file) and not os.path.isfile(output_file):
415        # The output is a non-regular file (e.g. pipe). Write to it directly.
416        output_temp = output_file
417    else:
418        # The output is a regular file. Write to a temporary file,
419        # then move it into place atomically.
420        output_temp = output_file + '.tmp'
421    changelog.write(output_temp)
422    check_output(output_temp, input_file, merged_files)
423    if output_temp != output_file:
424        os.rename(output_temp, output_file)
425
426def remove_merged_entries(files_to_remove):
427    for filename in files_to_remove:
428        os.remove(filename)
429
430def list_files_to_merge(options):
431    """List the entry files to merge, oldest first.
432
433    "Oldest" is defined by `EntryFileSortKey`.
434    """
435    files_to_merge = glob.glob(os.path.join(options.dir, '*.txt'))
436    files_to_merge.sort(key=EntryFileSortKey)
437    return files_to_merge
438
439def merge_entries(options):
440    """Merge changelog entries into the changelog file.
441
442    Read the changelog file from options.input.
443    Read entries to merge from the directory options.dir.
444    Write the new changelog to options.output.
445    Remove the merged entries if options.keep_entries is false.
446    """
447    with open(options.input, 'r', encoding='utf-8') as input_file:
448        changelog = ChangeLog(input_file, TextChangelogFormat)
449    files_to_merge = list_files_to_merge(options)
450    if not files_to_merge:
451        sys.stderr.write('There are no pending changelog entries.\n')
452        return
453    for filename in files_to_merge:
454        with open(filename, 'r', encoding='utf-8') as input_file:
455            changelog.add_file(input_file)
456    finish_output(changelog, options.output, options.input, files_to_merge)
457    if not options.keep_entries:
458        remove_merged_entries(files_to_merge)
459
460def show_file_timestamps(options):
461    """List the files to merge and their timestamp.
462
463    This is only intended for debugging purposes.
464    """
465    files = list_files_to_merge(options)
466    for filename in files:
467        ts = EntryFileSortKey(filename)
468        print(ts.category, ts.datetime, filename)
469
470def set_defaults(options):
471    """Add default values for missing options."""
472    output_file = getattr(options, 'output', None)
473    if output_file is None:
474        options.output = options.input
475    if getattr(options, 'keep_entries', None) is None:
476        options.keep_entries = (output_file is not None)
477
478def main():
479    """Command line entry point."""
480    parser = argparse.ArgumentParser(description=__doc__)
481    parser.add_argument('--dir', '-d', metavar='DIR',
482                        default='ChangeLog.d',
483                        help='Directory to read entries from'
484                             ' (default: ChangeLog.d)')
485    parser.add_argument('--input', '-i', metavar='FILE',
486                        default='ChangeLog',
487                        help='Existing changelog file to read from and augment'
488                             ' (default: ChangeLog)')
489    parser.add_argument('--keep-entries',
490                        action='store_true', dest='keep_entries', default=None,
491                        help='Keep the files containing entries'
492                             ' (default: remove them if --output/-o is not specified)')
493    parser.add_argument('--no-keep-entries',
494                        action='store_false', dest='keep_entries',
495                        help='Remove the files containing entries after they are merged'
496                             ' (default: remove them if --output/-o is not specified)')
497    parser.add_argument('--output', '-o', metavar='FILE',
498                        help='Output changelog file'
499                             ' (default: overwrite the input)')
500    parser.add_argument('--list-files-only',
501                        action='store_true',
502                        help=('Only list the files that would be processed '
503                              '(with some debugging information)'))
504    options = parser.parse_args()
505    set_defaults(options)
506    if options.list_files_only:
507        show_file_timestamps(options)
508        return
509    merge_entries(options)
510
511if __name__ == '__main__':
512    main()
513