1#!/usr/bin/env python3
2
3"""Assemble Mbed TLS change log entries into the change log file.
4
5Add changelog entries to the first level-2 section.
6Create a new level-2 section for unreleased changes if needed.
7Remove the input files unless --keep-entries is specified.
8
9In each level-3 section, entries are sorted in chronological order
10(oldest first). From oldest to newest:
11* Merged entry files are sorted according to their merge date (date of
12  the merge commit that brought the commit that created the file into
13  the target branch).
14* Committed but unmerged entry files are sorted according to the date
15  of the commit that adds them.
16* Uncommitted entry files are sorted according to their modification time.
17
18You must run this program from within a git working directory.
19"""
20
21# Copyright The Mbed TLS Contributors
22# SPDX-License-Identifier: Apache-2.0
23#
24# Licensed under the Apache License, Version 2.0 (the "License"); you may
25# not use this file except in compliance with the License.
26# You may obtain a copy of the License at
27#
28# http://www.apache.org/licenses/LICENSE-2.0
29#
30# Unless required by applicable law or agreed to in writing, software
31# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
32# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
33# See the License for the specific language governing permissions and
34# limitations under the License.
35
36import argparse
37from collections import OrderedDict, namedtuple
38import datetime
39import functools
40import glob
41import os
42import re
43import subprocess
44import sys
45
class InputFormatError(Exception):
    """Error in an input file, reported with its file name and line number.

    The exception text is ``FILENAME:LINE_NUMBER: MESSAGE`` where MESSAGE is
    ``message`` formatted with the remaining positional and keyword
    arguments via `str.format`.
    """

    def __init__(self, filename, line_number, message, *args, **kwargs):
        # Expand the caller's template first, then prefix the location.
        detail = message.format(*args, **kwargs)
        location = '{}:{}'.format(filename, line_number)
        super().__init__(location + ': ' + detail)
51
class CategoryParseError(Exception):
    """Failure to split a text into category sections.

    Attributes:
        line_offset: line position of the problem, as given by the raiser.
        error_message: description of the problem.

    The exception text is ``LINE_OFFSET: ERROR_MESSAGE``.
    """

    def __init__(self, line_offset, error_message):
        super().__init__('{}: {}'.format(line_offset, error_message))
        # Keep both pieces so a handler can rebuild a located error.
        self.error_message = error_message
        self.line_offset = line_offset
57
class LostContent(Exception):
    """An input line is missing from the generated output.

    The exception text names the source file and quotes the missing line.
    """

    def __init__(self, filename, line):
        super().__init__('Lost content from {}: "{}"'.format(filename, line))
62
# The category names we use in the changelog.
# If you edit this, update ChangeLog.d/README.md.
STANDARD_CATEGORIES = (
    'API changes',
    'Default behavior changes',
    'Requirement changes',
    'New deprecations',
    'Removals',
    'Features',
    'Security',
    'Bugfix',
    'Changes',
)

# The maximum line length for an entry
MAX_LINE_LENGTH = 80

# One category section as produced by a changelog format's splitter.
CategoryContent = namedtuple(
    'CategoryContent',
    (
        'name',        # Title text
        'title_line',  # Line number of the title
        'body',        # Body text
        'body_line',   # Starting line number of the body
    ),
)
84
class ChangelogFormat:
    """Virtual class documenting how to write a changelog format class.

    Every method defined here raises NotImplementedError; a concrete
    format class is expected to override each of them.
    """

    @classmethod
    def extract_top_version(cls, changelog_file_content):
        """Split out the top version section.

        If the top version is already released, create a new top
        version section for an unreleased version.

        Return ``(header, top_version_title, top_version_body, trailer)``
        where the "top version" is the existing top version section if it's
        for unreleased changes, and a newly created section otherwise.
        To assemble the changelog after modifying top_version_body,
        concatenate the four pieces.
        """
        raise NotImplementedError

    @classmethod
    def version_title_text(cls, version_title):
        """Return the text of a formatted version section title."""
        raise NotImplementedError

    @classmethod
    def split_categories(cls, version_body):
        """Split a changelog version section body into categories.

        Return a list of `CategoryContent`. In each element, the name is
        the category title without any formatting.
        """
        raise NotImplementedError

    @classmethod
    def format_category(cls, title, body):
        """Construct the text of a category section from its title and body."""
        raise NotImplementedError
121
class TextChangelogFormat(ChangelogFormat):
    """The traditional Mbed TLS changelog format.

    A version section starts with a line beginning with '='. A category
    title is a line that starts with a word character in column 0.
    """

    # Title given to a newly created section for unreleased changes.
    _unreleased_version_text = '= Mbed TLS x.x.x branch released xxxx-xx-xx'

    @classmethod
    def is_released_version(cls, title):
        """Return whether a version section title denotes a released version.

        A title is considered unreleased when it contains a release date
        that still ends with the placeholder character 'x'
        (for example ``xxxx-xx-xx``).
        """
        # Look for an incomplete release date
        return not re.search(r'[0-9x]{4}-[0-9x]{2}-[0-9x]?x', title)

    # Group 1: the title line of the first version section, including the
    # blank lines that follow it. Group 2: the body of that section, which
    # ends just before the next line starting with '=' or at the end of
    # the input.
    _top_version_re = re.compile(r'(?:\A|\n)(=[^\n]*\n+)(.*?\n)(?:=|$)',
                                 re.DOTALL)

    @classmethod
    def extract_top_version(cls, changelog_file_content):
        """Split out the top version section.

        A version section starts with a line starting with '='.

        Return ``(header, top_version_title, top_version_body, trailer)``.
        If the first version section is already released, the returned
        title is a fresh unreleased title, the body is empty, and the
        existing section is left untouched in the trailer.
        """
        # NOTE: this assumes that the content has at least one version
        # section with a non-empty body; otherwise the search returns None
        # and the attribute accesses below fail.
        m = cls._top_version_re.search(changelog_file_content)
        top_version_start = m.start(1)
        top_version_end = m.end(2)
        top_version_title = m.group(1)
        top_version_body = m.group(2)
        if cls.is_released_version(top_version_title):
            # Insert an empty unreleased section in front of the released
            # one: the trailer then starts where the released title starts.
            top_version_end = top_version_start
            top_version_title = cls._unreleased_version_text + '\n\n'
            top_version_body = ''
        return (changelog_file_content[:top_version_start],
                top_version_title, top_version_body,
                changelog_file_content[top_version_end:])

    @classmethod
    def version_title_text(cls, version_title):
        """Return the title line alone, without the trailing newline(s).

        Everything from the first newline onwards is removed.
        """
        # The third positional argument of re.sub is the subject string and
        # the fourth is a count, so the replacement must be given explicitly
        # and the flag must be passed by keyword.
        return re.sub(r'\n.*', '', version_title, flags=re.DOTALL)

    # A title line (group 1) followed by the newlines that separate it from
    # the category body.
    _category_title_re = re.compile(r'(^\w.*)\n+', re.MULTILINE)

    @classmethod
    def split_categories(cls, version_body):
        """Split a version section body into a list of `CategoryContent`.

        A category title is a line with the title in column 0.

        Each body is normalized to end with exactly one newline. The line
        numbers in the result count the newlines that precede the title or
        body in ``version_body`` (so the first line is line 0).

        Raise `CategoryParseError` if a non-empty ``version_body`` does not
        start with a category title.
        """
        if not version_body:
            return []
        title_matches = list(re.finditer(cls._category_title_re, version_body))
        if not title_matches or title_matches[0].start() != 0:
            # There is junk before the first category.
            raise CategoryParseError(0, 'Junk found where category expected')
        title_starts = [m.start(1) for m in title_matches]
        body_starts = [m.end(0) for m in title_matches]
        # Each body extends up to the next title, the last one up to the end.
        body_ends = title_starts[1:] + [len(version_body)]
        bodies = [version_body[body_start:body_end].rstrip('\n') + '\n'
                  for (body_start, body_end) in zip(body_starts, body_ends)]
        title_lines = [version_body[:pos].count('\n') for pos in title_starts]
        body_lines = [version_body[:pos].count('\n') for pos in body_starts]
        return [CategoryContent(title_match.group(1), title_line,
                                body, body_line)
                for title_match, title_line, body, body_line
                in zip(title_matches, title_lines, bodies, body_lines)]

    @classmethod
    def format_category(cls, title, body):
        """Construct the text of a category section from its title and body."""
        # `split_categories` ensures that each body ends with a newline.
        # Make sure that there is additionally a blank line between categories.
        if not body.endswith('\n\n'):
            body += '\n'
        return title + '\n' + body
182
class ChangeLog:
    """An Mbed TLS changelog.

    A changelog file consists of some header text followed by one or
    more version sections. The version sections are in reverse
    chronological order. Each version section consists of a title and a body.

    The body of a version section consists of zero or more category
    subsections. Each category subsection consists of a title and a body.

    A changelog entry file has the same format as the body of a version section.

    A `ChangelogFormat` object defines the concrete syntax of the changelog.
    Entry files must have the same format as the changelog file.
    """

    # Only accept dotted version numbers (e.g. "3.1", not "3").
    # Refuse ".x" in a version number where x is a letter: this indicates
    # a version that is not yet released. Something like "3.1a" is accepted.
    _version_number_re = re.compile(r'[0-9]+\.[0-9A-Za-z.]+')
    _incomplete_version_number_re = re.compile(r'.*\.[A-Za-z]')
    # A line that consists of a single URL, possibly surrounded by spaces.
    _only_url_re = re.compile(r'^\s*\w+://\S+\s*$')
    # A line that contains something that looks like a URL.
    _has_url_re = re.compile(r'.*://.*')

    def add_categories_from_text(self, filename, line_offset,
                                 text, allow_unknown_category):
        """Parse a version section or entry file.

        Append the body of each category found in ``text`` to the
        corresponding entry of ``self.categories``.

        ``filename`` is only used in error messages. ``line_offset`` is the
        line number in that file of the first line of ``text``.

        If ``allow_unknown_category`` is false, a category that is not
        already in ``self.categories`` is an error. Otherwise such a
        category is added after the existing ones.

        Raise `InputFormatError` if the text cannot be split into
        categories, if it has a disallowed category, or if a body line is
        longer than `MAX_LINE_LENGTH` (lines consisting only of a URL are
        exempt).
        """
        try:
            categories = self.format.split_categories(text)
        except CategoryParseError as e:
            raise InputFormatError(filename, line_offset + e.line_offset,
                                   e.error_message) from e
        for category in categories:
            if not allow_unknown_category and \
               category.name not in self.categories:
                raise InputFormatError(filename,
                                       line_offset + category.title_line,
                                       'Unknown category: "{}"',
                                       category.name)

            body_split = category.body.splitlines()

            for line_number, line in enumerate(body_split, 1):
                if not self._only_url_re.match(line) and \
                   len(line) > MAX_LINE_LENGTH:
                    long_url_msg = '. URL exceeding length limit must be alone in its line.' \
                        if self._has_url_re.match(line) else ""
                    # category.body_line counts from 0 and line_number from
                    # 1, so their sum is the 1-based position within the
                    # text; shift it by the position of the text in the file.
                    raise InputFormatError(filename,
                                           line_offset + category.body_line
                                           + line_number - 1,
                                           'Line is longer than allowed: '
                                           'Length {} (Max {}){}',
                                           len(line), MAX_LINE_LENGTH,
                                           long_url_msg)

            # Use a default so that a permitted unknown category creates a
            # new entry instead of raising KeyError.
            self.categories[category.name] = \
                self.categories.get(category.name, '') + category.body

    def __init__(self, input_stream, changelog_format):
        """Create a changelog object.

        Populate the changelog object from the content of the file
        input_stream. ``changelog_format`` provides the parsing and
        formatting methods described by `ChangelogFormat`.
        """
        self.format = changelog_format
        whole_file = input_stream.read()
        (self.header,
         self.top_version_title, top_version_body,
         self.trailer) = self.format.extract_top_version(whole_file)
        # Split the top version section into categories.
        self.categories = OrderedDict()
        for category in STANDARD_CATEGORIES:
            self.categories[category] = ''
        # Line number of the first line of the top version body.
        offset = (self.header + self.top_version_title).count('\n') + 1
        self.add_categories_from_text(input_stream.name, offset,
                                      top_version_body, True)

    def add_file(self, input_stream):
        """Add changelog entries from a file.

        Only categories that are already known are accepted.
        """
        self.add_categories_from_text(input_stream.name, 1,
                                      input_stream.read(), False)

    def write(self, filename):
        """Write the changelog to the specified file.

        Categories with an empty body are omitted.
        """
        with open(filename, 'w', encoding='utf-8') as out:
            out.write(self.header)
            out.write(self.top_version_title)
            for title, body in self.categories.items():
                if not body:
                    continue
                out.write(self.format.format_category(title, body))
            out.write(self.trailer)
275
276
@functools.total_ordering
class EntryFileSortKey:
    """This class defines an ordering on changelog entry files: older < newer.

    * Merged entry files are sorted according to their merge date (date of
      the merge commit that brought the commit that created the file into
      the target branch).
    * Committed but unmerged entry files are sorted according to the date
      of the commit that adds them.
    * Uncommitted entry files are sorted according to their modification time.

    This class assumes that the file is in a git working directory with
    the target branch checked out.
    """

    # Categories of files. A lower number is considered older.
    MERGED = 0
    COMMITTED = 1
    LOCAL = 2

    @staticmethod
    def creation_hash(filename):
        """Return the git commit id at which the given file was created.

        Return None if the file was never checked into git.
        """
        hashes = subprocess.check_output(['git', 'log', '--format=%H',
                                          '--follow',
                                          '--', filename])
        # Without re.MULTILINE, '$' only matches at the very end (or just
        # before a final newline), so this picks the last line of output.
        m = re.search('(.+)$', hashes.decode('ascii'))
        if not m:
            # The git output is empty. This means that the file was
            # never checked in.
            return None
        # The last commit in the log is the oldest one, which is when the
        # file was created.
        return m.group(0)

    @staticmethod
    def list_merges(some_hash, target, *options):
        """List merge commits from some_hash to target.

        Pass options to git to select which commits are included.

        Return a list of commit ids, which is empty if git lists no commit.
        """
        text = subprocess.check_output(['git', 'rev-list',
                                        '--merges', *options,
                                        '..'.join([some_hash, target])])
        # split() without a separator returns [] for empty output, whereas
        # split('\n') would return [''], i.e. one bogus empty commit id.
        return text.decode('ascii').split()

    @classmethod
    def merge_hash(cls, some_hash):
        """Return the git commit id at which the given commit was merged.

        Return None if the given commit was never merged.
        """
        target = 'HEAD'
        # List the merges from some_hash to the target in two ways.
        # The ancestry list is the ones that are both descendants of
        # some_hash and ancestors of the target.
        ancestry = frozenset(cls.list_merges(some_hash, target,
                                             '--ancestry-path'))
        # The first_parents list only contains merges that are directly
        # on the target branch. We want it in reverse order (oldest first).
        first_parents = cls.list_merges(some_hash, target,
                                        '--first-parent', '--reverse')
        # Look for the oldest merge commit that's both on the direct path
        # and directly on the target branch. That's the place where some_hash
        # was merged on the target branch. See
        # https://stackoverflow.com/questions/8475448/find-merge-commit-which-include-a-specific-commit
        for commit in first_parents:
            if commit in ancestry:
                return commit
        return None

    @staticmethod
    def commit_timestamp(commit_id):
        """Return the timestamp of the given commit.

        The result is a naive `datetime.datetime` expressed in UTC.
        """
        text = subprocess.check_output(['git', 'show', '-s',
                                        '--format=%ct',
                                        commit_id])
        # Same value as the deprecated datetime.utcfromtimestamp(): convert
        # in UTC, then drop the time zone to keep the datetime naive.
        aware = datetime.datetime.fromtimestamp(int(text),
                                                datetime.timezone.utc)
        return aware.replace(tzinfo=None)

    @staticmethod
    def file_timestamp(filename):
        """Return the modification timestamp of the given file.

        The result is a naive `datetime.datetime` in local time. It is only
        ever compared with other file timestamps because the file category
        comes first in the sort key.
        """
        mtime = os.stat(filename).st_mtime
        return datetime.datetime.fromtimestamp(mtime)

    def __init__(self, filename):
        """Determine position of the file in the changelog entry order.

        This constructor returns an object that can be used with comparison
        operators, with `sort` and `sorted`, etc. Older entries are sorted
        before newer entries.
        """
        self.filename = filename
        creation_hash = self.creation_hash(filename)
        if not creation_hash:
            self.category = self.LOCAL
            self.datetime = self.file_timestamp(filename)
            return
        merge_hash = self.merge_hash(creation_hash)
        if not merge_hash:
            self.category = self.COMMITTED
            self.datetime = self.commit_timestamp(creation_hash)
            return
        self.category = self.MERGED
        self.datetime = self.commit_timestamp(merge_hash)

    def sort_key(self):
        """Return a concrete sort key for this entry file sort key object.

        ``ts1 < ts2`` is implemented as ``ts1.sort_key() < ts2.sort_key()``.
        """
        return (self.category, self.datetime, self.filename)

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing sort_key attribute.
        if not isinstance(other, EntryFileSortKey):
            return NotImplemented
        return self.sort_key() == other.sort_key()

    def __lt__(self, other):
        if not isinstance(other, EntryFileSortKey):
            return NotImplemented
        return self.sort_key() < other.sort_key()
398
399
def check_output(generated_output_file, main_input_file, merged_files):
    """Make sanity checks on the generated output.

    The intent of these sanity checks is to have reasonable confidence
    that no content has been lost.

    The sanity check is that every line that is present in an input file
    is also present in an output file. This is not perfect but good enough
    for now.

    Raise `LostContent` for the first input line that is missing from the
    output. Lines are compared including their line terminator.
    """
    with open(generated_output_file, 'r', encoding='utf-8') as fd:
        generated_output = set(fd)
    # Open each input in a `with` block so that it is closed promptly,
    # including when LostContent is raised.
    with open(main_input_file, 'r', encoding='utf-8') as fd:
        for line in fd:
            if line not in generated_output:
                raise LostContent('original file', line)
    for merged_file in merged_files:
        with open(merged_file, 'r', encoding='utf-8') as fd:
            for line in fd:
                if line not in generated_output:
                    raise LostContent(merged_file, line)
419
def finish_output(changelog, output_file, input_file, merged_files):
    """Write the changelog to the output file.

    The input file and the list of merged files are used only for sanity
    checks on the output.

    If the sanity checks fail, the exception from `check_output` propagates
    and the output file is not replaced.
    """
    if os.path.exists(output_file) and not os.path.isfile(output_file):
        # The output is a non-regular file (e.g. pipe). Write to it directly.
        output_temp = output_file
    else:
        # The output is a regular file. Write to a temporary file,
        # then move it into place atomically.
        output_temp = output_file + '.tmp'
    changelog.write(output_temp)
    check_output(output_temp, input_file, merged_files)
    if output_temp != output_file:
        # os.replace overwrites an existing destination on every platform,
        # whereas os.rename fails on Windows if the destination exists
        # (which is the normal case when updating the changelog in place).
        os.replace(output_temp, output_file)
437
def remove_merged_entries(files_to_remove):
    """Delete each of the given entry files."""
    for merged_path in files_to_remove:
        os.remove(merged_path)
441
def list_files_to_merge(options):
    """Return the entry files to merge, sorted from oldest to newest.

    The candidates are the ``*.txt`` files in the directory options.dir.
    The age of a file is determined by `EntryFileSortKey`.
    """
    entry_pattern = os.path.join(options.dir, '*.txt')
    return sorted(glob.glob(entry_pattern), key=EntryFileSortKey)
450
def merge_entries(options):
    """Merge changelog entries into the changelog file.

    Read the changelog file from options.input.
    Read entries to merge from the directory options.dir.
    Write the new changelog to options.output.
    Remove the merged entries if options.keep_entries is false.

    If there is no entry to merge, report it on standard error and leave
    all files untouched.
    """
    with open(options.input, 'r', encoding='utf-8') as changelog_stream:
        changelog = ChangeLog(changelog_stream, TextChangelogFormat)
    entry_paths = list_files_to_merge(options)
    if entry_paths:
        for entry_path in entry_paths:
            with open(entry_path, 'r', encoding='utf-8') as entry_stream:
                changelog.add_file(entry_stream)
        finish_output(changelog, options.output, options.input, entry_paths)
        # Only delete the entries once the output has been written.
        if not options.keep_entries:
            remove_merged_entries(entry_paths)
    else:
        sys.stderr.write('There are no pending changelog entries.\n')
471
def show_file_timestamps(options):
    """Print the category, timestamp and name of each file to merge.

    The files are listed in merge order.
    This is only intended for debugging purposes.
    """
    for entry_path in list_files_to_merge(options):
        key = EntryFileSortKey(entry_path)
        print(key.category, key.datetime, entry_path)
481
def set_defaults(options):
    """Fill in the options that were not given explicitly.

    * Without an output file, the input file is overwritten.
    * Unless specified, entries are kept exactly when an output file
      was given.
    """
    output_given = getattr(options, 'output', None) is not None
    if not output_given:
        options.output = options.input
    if getattr(options, 'keep_entries', None) is None:
        options.keep_entries = output_given
489
def main():
    """Command line entry point.

    Parse the command line, fill in the defaults, then either list the
    entry files or merge them into the changelog.
    """
    arg_parser = argparse.ArgumentParser(description=__doc__)
    add = arg_parser.add_argument
    add('--dir', '-d', metavar='DIR',
        default='ChangeLog.d',
        help='Directory to read entries from'
             ' (default: ChangeLog.d)')
    add('--input', '-i', metavar='FILE',
        default='ChangeLog',
        help='Existing changelog file to read from and augment'
             ' (default: ChangeLog)')
    # --keep-entries and --no-keep-entries share a destination whose
    # default is None, so that set_defaults can tell "not specified" apart.
    add('--keep-entries',
        action='store_true', dest='keep_entries', default=None,
        help='Keep the files containing entries'
             ' (default: remove them if --output/-o is not specified)')
    add('--no-keep-entries',
        action='store_false', dest='keep_entries',
        help='Remove the files containing entries after they are merged'
             ' (default: remove them if --output/-o is not specified)')
    add('--output', '-o', metavar='FILE',
        help='Output changelog file'
             ' (default: overwrite the input)')
    add('--list-files-only',
        action='store_true',
        help=('Only list the files that would be processed '
              '(with some debugging information)'))
    options = arg_parser.parse_args()
    set_defaults(options)
    if options.list_files_only:
        show_file_timestamps(options)
    else:
        merge_entries(options)
522
# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
525