1#!/usr/bin/perl
2# Copyright (c) 2018 Markus Kuhn
3# Uniset -- Unicode subset manager -- Markus Kuhn
4# http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
5
6require 5.014;
7use open ':utf8';
8use FindBin qw($RealBin);  # to find directory where this file is located
9
10binmode(STDOUT, ":utf8");
11binmode(STDIN, ":utf8");
12
13my (%name, %invname, %category, %comment);
14
15print <<End if $#ARGV < 0;
16Uniset -- Unicode subset manager -- Markus Kuhn
17
18Uniset merges and subtracts Unicode subsets. It can output and
19analyse the resulting character set in various formats.
20
21Uniset understand the following command-line arguments:
22
23Commands to define a set of characters:
24
25  + filename   add the character set described in the file to the set
26  - filename   remove the character set described in the file from the set
27  +: filename  add the characters in the UTF-8 file to the set
28  -: filename  remove the characters in the UTF-8 file from the set
29  +xxxx..yyyy  add the range to the set (xxxx and yyyy are hex numbers)
30  -xxxx..yyyy  remove the range from the set (xxxx and yyyy are hex numbers)
31  +cat=Xx      add all Unicode characters with category code Xx
32  -cat=Xx      remove all Unicode characters with category code Xx
33  -cat!=Xx     remove all Unicode characters without category code Xx
34  clean        remove any elements that do not appear in the Unicode database
35  unknown      remove any elements that do appear in the Unicode database
36
37Command to output descriptions of the constructed set of characters:
38
39  table        write a full table with one line per character
40  compact      output the set in compact MES format
41  c            output the set as C interval array
42  nr           output the number of characters
43  sources      output a table that shows the number of characters contributed
44               by the various combinations of input sets added with +.
45  utf8-list    output a list of all characters encoded in UTF-8
46
47Commands to tailor the following output commands:
48
49  html         write HTML tables instead of plain text
50  ucs          add the unicode character itself to the table (UTF-8 in
51               plain table, numeric character reference in HTML)
52
53Formats of character set input files read by the + and - command:
54
55Empty lines, white space at the start and end of the line and any
56comment text following a \# are ignored. The following formats are
57recognized
58
59xx yyyy             xx is the hex code in an 8-bit character set and yyyy
60                    is the corresponding Unicode value. Both can optionally
61                    be prefixed by 0x. This is the format used in the
62                    files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.
63
64yyyy                yyyy (optionally prefixed with 0x) is a Unicode character
65                    belonging to the specified subset.
66
67yyyy-yyyy           a range of Unicode characters belonging to
68yyyy..yyyy          the specified subset.
69
70xx yy yy yy-yy yy   xx denotes a row (high-byte) and the yy specify
71                    corresponding low bytes or with a hyphen also ranges of
72                    low bytes in the Unicode values that belong to this
73                    subset. This is also the format that is generated by
74                    the compact command.
75End
76exit 1 if $#ARGV < 0;
77
78
79# Subroutine to identify whether the ISO 10646/Unicode character code
80# ucs belongs into the East Asian Wide (W) or East Asian FullWidth
81# (F) category as defined in Unicode Technical Report #11.
82
83sub iswide ($) {
84    my $ucs = shift(@_);
85
86    return ($ucs >= 0x1100 &&
87	    ($ucs <= 0x115f ||                     # Hangul Jamo
88	     $ucs == 0x2329 || $ucs == 0x232a ||
89	     ($ucs >= 0x2e80 && $ucs <= 0xa4cf &&
90	      $ucs != 0x303f) ||                   # CJK .. Yi
91	     ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables
92	     ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs
93	     ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms
94	     ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms
95	     ($ucs >= 0xffe0 && $ucs <= 0xffe6) ||
96	     ($ucs >= 0x20000 && $ucs <= 0x2fffd) ||
97	     ($ucs >= 0x30000 && $ucs <= 0x3fffd)));
98}
99
100# Return the Unicode name that belongs to a given character code
101
102# Jamo short names, see Unicode 3.0, table 4-4, page 86
103
104my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
105	     'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
106my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
107	     'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
108	     'EU', 'YI', 'I'); # 1161..1175
109my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
110	     'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
111	     'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2
112
113sub name {
114    my $ucs = shift(@_);
115
116    # The intervals used here reflect Unicode Version 3.2
117    if (($ucs >=  0x3400 && $ucs <=  0x4db5) ||
118	($ucs >=  0x4e00 && $ucs <=  0x9fa5) ||
119	($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
120	return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
121    }
122
123    if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
124	my $s = $ucs - 0xac00;
125	my $l = 0x1100 + int($s / (21 * 28));
126	my $v = 0x1161 + int(($s % (21 * 28)) / 28);
127	my $t = 0x11a7 + $s % 28;
128	return "HANGUL SYLLABLE " .
129	    ($lname[int($s / (21 * 28))] .
130	     $vname[int(($s % (21 * 28)) / 28)] .
131	     $tname[$s % 28 - 1]);
132    }
133
134    return $name{$ucs};
135}
136
137sub is_unicode {
138    my $ucs = shift(@_);
139
140    # The intervals used here reflect Unicode Version 3.2
141    if (($ucs >=  0x3400 && $ucs <=  0x4db5) ||
142	($ucs >=  0x4e00 && $ucs <=  0x9fa5) ||
143	($ucs >=  0xac00 && $ucs <=  0xd7a3) ||
144	($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
145	return 1;
146    }
147
148    return exists $name{$ucs};
149}
150
151my @search_path = ();
152if ($RealBin =~ m|^(.*)/bin\z| && -d "$1/share/uniset") {
153    push @search_path, "$1/share/uniset";
154} else {
155    push @search_path, $RealBin;
156}
157
158sub search_open {
159    my ($mode, $fn) = @_;
160    my $file;
161    return $file if open($file, $mode, $fn);
162    return undef if $fn =~ m|/|;
163    for my $path (@search_path) {
164	return $file if open($file, $mode, "$path/$fn");
165    }
166    return undef;
167}
168
169my $html = 0;
170my $image = 0;
171my $adducs = 0;
172my $unicodedata = "UnicodeData.txt";
173my $blockdata = "Blocks.txt";
174
175# read list of all Unicode names
176my $data = search_open('<', $unicodedata);
177unless ($data) {
178    die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
179	 "Please make sure that you have downloaded the file\n" .
180	 "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
181}
182while (<$data>) {
183    if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) {
184	next if $2 ne '<control>' && substr($2, 0, 1) eq '<';
185	$ucs = hex($1);
186        $name{$ucs} = $2;
187	$invname{$2} = $ucs;
188	$category{$ucs} = $3;
189        $comment{$ucs} = $12;
190    } else {
191        die("Syntax error in line '$_' in file '$unicodedata'\n");
192    }
193}
194close($data);
195
196# read list of all Unicode blocks
197$data = search_open('<', $blockdata);
198unless ($data) {
199    die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" .
200	 "Please make sure that you have downloaded the file\n" .
201	 "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n");
202}
203my $blocks = 0;
204my (@blockstart, @blockend, @blockname);
205while (<$data>) {
206    if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) {
207        $blockstart[$blocks] = hex($1);
208	$blockend  [$blocks] = hex($2);
209        $blockname [$blocks] = $3;
210	$blocks++;
211    } elsif (/^\s*\#/ || /^\s*$/) {
212	# ignore comments and empty lines
213    } else {
214        die("Syntax error in line '$_' in file '$blockdata'\n");
215    }
216}
217close($data);
218if ($blockend[$blocks-1] < 0x110000) {
219    $blockstart[$blocks] = 0x110000;
220    $blockend  [$blocks] = 0x7FFFFFFF;
221    $blockname [$blocks] = "Beyond Plane 16";
222    $blocks++;
223}
224
225# process command line arguments
226while ($_ = shift(@ARGV)) {
227    if (/^html$/) {
228	$html = 1;
229    } elsif (/^ucs$/) {
230	$adducs = 1;
231    } elsif (/^img$/) {
232	$html = 1;
233	$image = 1;
234    } elsif (/^template$/) {
235	$template = shift(@ARGV);
236	open(TEMPLATE, $template) || die("Can't open template file '$template': $!\n");
237	while (<TEMPLATE>) {
238	    if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) {
239		open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n");
240		while (<INCLUDE>) {
241		    print $_;
242		}
243		close(INCLUDE);
244	    } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
245		open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n");
246		while (<INCLUDE>) {
247		    s/&/&amp;/g;
248		    s/</&lt;/g;
249		    print $_;
250		}
251		close(INCLUDE);
252	    } else {
253		print $_;
254	    }
255	}
256	close(TEMPLATE);
257    } elsif (/^\+cat=(.+)$/) {
258	# add characters with given category
259	$cat = $1;
260	for $i (keys(%category)) {
261	    $used{$i} = "[${cat}]" if $category{$i} eq $cat;
262	}
263    } elsif (/^\-cat=(.+)$/) {
264	# remove characters with given category
265	$cat = $1;
266	for $i (keys(%category)) {
267	    delete $used{$i} if $category{$i} eq $cat;
268	}
269    } elsif (/^\-cat!=(.+)$/) {
270	# remove characters without given category
271	$cat = $1;
272	for $i (keys(%category)) {
273	    delete $used{$i} unless $category{$i} eq $cat;
274	}
275    } elsif (/^([+-]):(.*)/) {
276	$remove = $1 eq "-";
277	$setfile = $2;
278	$setfile = shift(@ARGV) if $setfile eq "";
279	push(@SETS, $setfile);
280	open(SET, $setfile) || die("Can't open set file '$setfile': $!\n");
281	$setname = $setfile;
282	while (<SET>) {
283	    while ($_) {
284		$i = ord($_);
285		$used{$i} .= "[${setname}]" unless $remove;
286		delete $used{$i} if $remove;
287		$_ = substr($_, 1);
288	    }
289	}
290	close SET;
291    } elsif (/^([+-])(.*)/) {
292	$remove = $1 eq "-";
293	$setfile = $2;
294	$setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
295	if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
296	    # handle intervall specification on command line
297	    $first = hex($1);
298	    $last = hex($3);
299	    for ($i = $first; $i <= $last; $i++) {
300		$used{$i} .= "[ARG]" unless $remove;
301		delete $used{$i} if $remove;
302	    }
303	    next;
304	}
305	$setfile = shift(@ARGV) if $setfile eq "";
306	push(@SETS, $setfile);
307	my $setf = search_open('<', $setfile);
308	die("Can't open set file '$setfile': $!\n") unless $setf;
309	$cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
310	$setname = $setfile;
311	$setname =~ s/([^.\[\]]*)\..*/$1/;
312	while (<$setf>) {
313	    if (/^<code_set_name>/) {
314		# handle ISO 15897 (POSIX registry) charset mapping format
315		undef $comment_char;
316		undef $escape_char;
317		while (<$setf>) {
318		    if ($comment_char && /^$comment_char/) {
319			# remove comments
320			$_ = $`;
321		    }
322		    next if (/^\032?\s*$/);                                             # skip empty lines
323		    if (/^<comment_char> (\S)$/) {
324			$comment_char = $1;
325		    } elsif (/^<escape_char> (\S)$/) {
326			$escape_char = $1;
327		    } elsif (/^(END )?CHARMAP$/) {
328			#ignore
329		    } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
330			$used{hex($2)} .= "[${setname}{$1}]" unless $remove;
331			delete $used{hex($2)} if $remove;
332		    } else {
333			die("Syntax error in line $. in file '$setfile':\n'$_'\n");
334		    }
335		}
336		next;
337	    } elsif (/^STARTFONT /) {
338		# handle X11 BDF file
339		while (<$setf>) {
340		    if (/^ENCODING\s+([0-9]+)/) {
341			$used{$1} .= "[${setname}]" unless $remove;
342			delete $used{$1} if $remove;
343		    }
344		}
345		next;
346	    }
347	    tr/a-z/A-Z/;           # make input uppercase
348	    if ($cedf) {
349		if ($. > 4) {
350		    if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
351			# handle Kosta Kosti's trans CEDF format
352			next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
353			$ucs = $invname{$2};
354			die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
355			$used{$ucs} .= "[${setname}{$1}]" unless $remove;
356			delete $used{$ucs} if $remove;
357		    } else {
358			die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
359		    }
360		}
361		next;
362	    }
363	    if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
364		# ignore ftp.unicode.org mapping file lines with #UNDEFINED
365		next;
366	    }
367	    s/^([^\#]*)\#.*$/$1/;  # remove comments
368	    next if (/^\032?\s*$/);     # skip empty lines
369	    if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
370		# handle entry from a ftp.unicode.org mapping file
371		$used{hex($4)} .= "[${setname}{$2}]" unless $remove;
372		delete $used{hex($4)} if $remove;
373	    } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
374		# handle interval specification
375		$first = hex($2);
376		$last = hex($5);
377		for ($i = $first; $i <= $last; $i++) {
378		    $used{$i} .= "[${setname}]" unless $remove;
379		    delete $used{$i} if $remove;
380		}
381	    } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
382		# handle lines from P10 MES draft
383		$row = $1;
384		$cols = $_;
385		$cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
386		$cols =~ tr/,//d;
387		@cols = split(/\s+/, $cols);
388		for (@cols) {
389		    if (/^(..)$/) {
390			$first = hex("$row$1");
391			$last  = $first;
392		    } elsif (/^(..)-(..)$/) {
393			$first = hex("$row$1");
394			$last  = hex("$row$2");
395		    } else {
396			die ("this should never happen '$_'");
397		    }
398		    for ($i = $first; $i <= $last; $i++) {
399			$used{$i} .= "[${setname}]" unless $remove;
400			delete $used{$i} if $remove;
401		    }
402		}
403	    } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
404		# handle single character
405		$used{hex($2)} .= "[${setname}]" unless $remove;
406		delete $used{hex($2)} if $remove;
407	    } else {
408		die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
409	    }
410	}
411	close $setf;
412    } elsif (/^loadimages$/ || /^loadbigimages$/) {
413	if (/^loadimages$/) {
414	    $prefix = "Small.Glyphs";
415	} else {
416	    $prefix = "Glyphs";
417	}
418	$total = 0;
419	for $i (keys(%used)) {
420	    next if ($name{$i} eq "<control>");
421	    $total++;
422	}
423	$count = 0;
424	$| = 1;
425	for $i (sort({$a <=> $b} keys(%used))) {
426	    next if ($name{$i} eq "<control>");
427	    $count++;
428	    $j = sprintf("%04X", $i);
429	    $j =~ /(..)(..)/;
430	    $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
431	    print("\r$count/$total: $gif");
432	    system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
433	    select(undef, undef, undef, 0.2);
434	}
435	print("\n");
436	exit 0;
437    } elsif (/^giftable/) {
438	# form a table of glyphs (requires pbmtools installed)
439	$count = 0;
440	for $i (keys(%used)) {
441	    $count++ unless $name{$i} eq "<control>";
442	}
443	$width = int(sqrt($count/sqrt(2)) + 0.5);
444	$width = $1 if /^giftable([0-9]+)$/;
445	system("rm -f tmp-*.pnm table.pnm~ table.pnm");
446	$col = 0;
447	$row = 0;
448	for $i (sort({$a <=> $b} keys(%used))) {
449	    next if ($name{$i} eq "<control>");
450	    $j = sprintf("%04X", $i);
451	    $j =~ /(..)(..)/;
452	    $gif = "Small.Glyphs/$1/U$j.gif";
453	    $pnm = sprintf("tmp-%02x.pnm", $col);
454	    $fallback = "Small.Glyphs/FF/UFFFD.gif";
455	    system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
456	    if (++$col == $width) {
457		system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
458		if ($row == 0) {
459		    system("mv tmp-row.pnm table.pnm");
460		} else {
461		    system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
462		}
463		$row++;
464		$col = 0;
465		system("rm -f tmp-*.pnm table.pnm~");
466	    }
467	}
468	if ($col > 0) {
469	    system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
470	    if ($row == 0) {
471		system("mv tmp-row.pnm table.pnm");
472	    } else {
473		system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
474	    }
475	}
476	system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
477	system("rm -f tmp-*.pnm table.pnm~ table.pnm");
478    } elsif (/^table$/) {
479	# go through all used names to print full table
480	print "<TABLE border=2>\n" if $html;
481	for $i (sort({$a <=> $b} keys(%used))) {
482	    next if ($name{$i} eq "<control>");
483	    if ($html) {
484		$sources = $used{$i};
485		$sources =~ s/\]\[/, /g;
486		$sources =~ s/^\[//g;
487		$sources =~ s/\]$//g;
488		$sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
489		$j = sprintf("%04X", $i);
490		$j =~ /(..)(..)/;
491		$gif = "Small.Glyphs/$1/U$j.gif";
492		print "<TR>";
493		print "<TD><img width=32 height=32 src=\"$gif\">" if $image;
494		printf("<TD>&#%d;", $i) if $adducs;
495		print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i);
496		print " ($comment{$i})" if $comment{$i};
497		print "</SAMP><TD><SMALL>$sources</SMALL>\n";
498	    } else {
499		printf("%04X \# ", $i);
500		print pack("U", $i) . " " if $adducs;
501		print name($i) ."\n";
502	    }
503	}
504	print "</TABLE>\n" if $html;
505    } elsif (/^imgblock$/) {
506	$width = 16;
507	$width = $1 if /giftable([0-9]+)/;
508	$col = 0;
509	$subline = "";
510	print "\n<P><TABLE cellspacing=0 cellpadding=0>";
511	for $i (sort({$a <=> $b} keys(%used))) {
512	    print "<TR>" if $col == 0;
513	    $j = sprintf("%04X", $i);
514	    $j =~ /(..)(..)/;
515	    $gif = "Small.Glyphs/$1/U$j.gif";
516	    $alt = name($i);
517	    print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">";
518	    $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>";
519	    if (++$col == $width) {
520		print "<TR align=center>$subline";
521		$col = 0;
522		$subline = "";
523	    }
524	}
525	print "<TR align=center>$subline" if ($col > 0);
526	print "</TABLE>\n";
527    } elsif (/^sources$/) {
528	# count how many characters are attributed to the various source set combinations
529	print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
530	for $i (keys(%used)) {
531	    next if ($name{$i} eq "<control>");
532	    $sources = $used{$i};
533	    $sources =~ s/\]\[/, /g;
534	    $sources =~ s/^\[//g;
535	    $sources =~ s/\]$//g;
536	    $sources =~ s/\{(..)\}//g;
537	    $contribs{$sources} += 1;
538	}
539	for $j (keys(%contribs)) {
540	    print "<TR><TD>$contribs{$j}<TD>$j\n" if $html;
541	}
542	print "</TABLE>\n" if $html;
543    } elsif (/^compact$/) {
544	# print compact table in P10 MES format
545	print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
546	print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
547	print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
548	$current_row = '';
549	$start_col = '';
550	$last_col = '';
551	for $i (sort({$a <=> $b} keys(%used))) {
552	    next if ($name{$i} eq "<control>");
553	    $row = sprintf("%02X", $i >> 8);
554	    $col = sprintf("%02X", $i & 0xff);
555	    if ($row ne $current_row) {
556		if (($last_col ne '') and ($last_col ne $start_col)) {
557		    print "-$last_col";
558		    print "</SAMP>" if $html;
559		}
560		print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
561		print "\n  $row\t" unless $html;
562		$len = 0;
563		$current_row = $row;
564		$start_col = '';
565	    }
566	    if ($start_col eq '') {
567		print "$col";
568		$len += 2;
569		$start_col = $col;
570		$last_col = $col;
571	    } elsif (hex($col) == hex($last_col) + 1) {
572		$last_col = $col;
573	    } else {
574		if ($last_col ne $start_col) {
575		    print "-$last_col";
576		    $len += 3;
577		}
578		if ($len > 60 && !$html) {
579		    print "\n  $row\t";
580		    $len = 0;
581		};
582		print " " if $len;
583		print "$col";
584		$len += 2 + !! $len;
585		$start_col = $col;
586		$last_col = $col;
587	    }
588	}
589	if (($last_col ne '') and ($last_col ne $start_col)) {
590	    print "-$last_col";
591	    print "</SAMP>" if $html;
592	}
593	print "\n" if ($current_row ne '');
594	print "</TABLE>\n" if $html;
595	print "\n";
596    } elsif (/^c$/) {
597	# print table as C interval array
598	print "{";
599	$last_i = '';
600	$columns = 3;
601	$col = $columns;
602	for $i (sort({$a <=> $b} keys(%used))) {
603	    next if ($name{$i} eq "<control>");
604	    if ($last_i eq '') {
605		if (++$col > $columns) { $col = 1; print "\n "; }
606		printf(" { 0x%04X, ", $i);
607		$last_i = $i;
608	    } elsif ($i == $last_i + 1) {
609		$last_i = $i;
610	    } else {
611		printf("0x%04X },", $last_i);
612		if (++$col > $columns) { $col = 1; print "\n "; }
613		printf(" { 0x%04X, ", $i);
614		$last_i = $i;
615	    }
616	}
617	if ($last_i ne '') {
618	    printf("0x%04X }", $last_i);
619	}
620	print "\n};\n";
621    } elsif (/^utf8-list$/) {
622	$col = 0;
623	$block = 0;
624	$last = -1;
625	for $i (sort({$a <=> $b} keys(%used))) {
626	    next if ($name{$i} eq "<control>");
627	    while ($blockend[$block] < $i && $block < $blocks - 1) {
628		$block++;
629	    }
630	    if ($last <= $blockend[$block-1] &&
631		$i < $blockstart[$block]) {
632		print "\n" if ($col);
633		printf "\nFree block (U+%04X-U+%04X):\n\n",
634		    $blockend[$block-1] + 1, $blockstart[$block] - 1;
635		$col = 0;
636	    }
637	    if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
638		print "\n" if ($col);
639		printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n",
640		$blockstart[$block], $blockend[$block];
641		$col = 0;
642	    }
643	    if ($category{$i} eq 'Mn') {
644		# prefix non-spacing character with U+25CC DOTTED CIRCLE
645		print "\x{25CC}";
646	    } elsif ($category{$i} eq 'Me') {
647		# prefix enclosing non-spacing character with space
648		print " ";
649	    }
650	    print pack("U", $i);
651	    $col += 1 + iswide($i);
652	    if ($col >= 64) {
653		print "\n";
654		$col = 0;
655	    }
656	    $last = $i;
657	}
658	print "\n" if ($col);
659    } elsif (/^collections$/) {
660	$block = 0;
661	$last = -1;
662	for $i (sort({$a <=> $b} keys(%used))) {
663	    next if ($name{$i} eq "<control>");
664	    while ($blockend[$block] < $i && $block < $blocks - 1) {
665		$block++;
666	    }
667	    if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
668		print $blockname[$block],
669		  " " x (40 - length($blockname[$block]));
670		printf "%04X-%04X\n",
671		  $blockstart[$block], $blockend[$block];
672	    }
673	    $last = $i;
674	}
675    } elsif (/^nr$/) {
676	print "<P>" if $html;
677	print "# " unless $html;
678	print "Number of characters in above table: ";
679	$count = 0;
680	for $i (keys(%used)) {
681	    $count++ unless $name{$i} eq "<control>";
682	}
683	print $count;
684	print "\n";
685    } elsif (/^clean$/) {
686	# remove characters from set that are not in $unicodedata
687	for $i (keys(%used)) {
688	    delete $used{$i} unless is_unicode($i);
689	}
690    } elsif (/^unknown$/) {
691	# remove characters from set that are in $unicodedata
692	for $i (keys(%used)) {
693	    delete $used{$i} if is_unicode($i);
694	}
695    } else {
696	die("Unknown command line command '$_'\n");
697    };
698}
699