1#!/usr/bin/perl 2# Copyright (c) 2018 Markus Kuhn 3# Uniset -- Unicode subset manager -- Markus Kuhn 4# http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz 5 6require 5.014; 7use open ':utf8'; 8use FindBin qw($RealBin); # to find directory where this file is located 9 10binmode(STDOUT, ":utf8"); 11binmode(STDIN, ":utf8"); 12 13my (%name, %invname, %category, %comment); 14 15print <<End if $#ARGV < 0; 16Uniset -- Unicode subset manager -- Markus Kuhn 17 18Uniset merges and subtracts Unicode subsets. It can output and 19analyse the resulting character set in various formats. 20 21Uniset understand the following command-line arguments: 22 23Commands to define a set of characters: 24 25 + filename add the character set described in the file to the set 26 - filename remove the character set described in the file from the set 27 +: filename add the characters in the UTF-8 file to the set 28 -: filename remove the characters in the UTF-8 file from the set 29 +xxxx..yyyy add the range to the set (xxxx and yyyy are hex numbers) 30 -xxxx..yyyy remove the range from the set (xxxx and yyyy are hex numbers) 31 +cat=Xx add all Unicode characters with category code Xx 32 -cat=Xx remove all Unicode characters with category code Xx 33 -cat!=Xx remove all Unicode characters without category code Xx 34 clean remove any elements that do not appear in the Unicode database 35 unknown remove any elements that do appear in the Unicode database 36 37Command to output descriptions of the constructed set of characters: 38 39 table write a full table with one line per character 40 compact output the set in compact MES format 41 c output the set as C interval array 42 nr output the number of characters 43 sources output a table that shows the number of characters contributed 44 by the various combinations of input sets added with +. 45 utf8-list output a list of all characters encoded in UTF-8 46 47Commands to tailor the following output commands: 48 49 html write HTML tables instead of plain text 50 ucs add the unicode character itself to the table (UTF-8 in 51 plain table, numeric character reference in HTML) 52 53Formats of character set input files read by the + and - command: 54 55Empty lines, white space at the start and end of the line and any 56comment text following a \# are ignored. The following formats are 57recognized 58 59xx yyyy xx is the hex code in an 8-bit character set and yyyy 60 is the corresponding Unicode value. Both can optionally 61 be prefixed by 0x. This is the format used in the 62 files on <ftp://ftp.unicode.org/Public/MAPPINGS/>. 63 64yyyy yyyy (optionally prefixed with 0x) is a Unicode character 65 belonging to the specified subset. 66 67yyyy-yyyy a range of Unicode characters belonging to 68yyyy..yyyy the specified subset. 69 70xx yy yy yy-yy yy xx denotes a row (high-byte) and the yy specify 71 corresponding low bytes or with a hyphen also ranges of 72 low bytes in the Unicode values that belong to this 73 subset. This is also the format that is generated by 74 the compact command. 75End 76exit 1 if $#ARGV < 0; 77 78 79# Subroutine to identify whether the ISO 10646/Unicode character code 80# ucs belongs into the East Asian Wide (W) or East Asian FullWidth 81# (F) category as defined in Unicode Technical Report #11. 82 83sub iswide ($) { 84 my $ucs = shift(@_); 85 86 return ($ucs >= 0x1100 && 87 ($ucs <= 0x115f || # Hangul Jamo 88 $ucs == 0x2329 || $ucs == 0x232a || 89 ($ucs >= 0x2e80 && $ucs <= 0xa4cf && 90 $ucs != 0x303f) || # CJK .. Yi 91 ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables 92 ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs 93 ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms 94 ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms 95 ($ucs >= 0xffe0 && $ucs <= 0xffe6) || 96 ($ucs >= 0x20000 && $ucs <= 0x2fffd) || 97 ($ucs >= 0x30000 && $ucs <= 0x3fffd))); 98} 99 100# Return the Unicode name that belongs to a given character code 101 102# Jamo short names, see Unicode 3.0, table 4-4, page 86 103 104my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 105 'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112 106my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 107 'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 108 'EU', 'YI', 'I'); # 1161..1175 109my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 110 'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 111 'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2 112 113sub name { 114 my $ucs = shift(@_); 115 116 # The intervals used here reflect Unicode Version 3.2 117 if (($ucs >= 0x3400 && $ucs <= 0x4db5) || 118 ($ucs >= 0x4e00 && $ucs <= 0x9fa5) || 119 ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) { 120 return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs); 121 } 122 123 if ($ucs >= 0xac00 && $ucs <= 0xd7a3) { 124 my $s = $ucs - 0xac00; 125 my $l = 0x1100 + int($s / (21 * 28)); 126 my $v = 0x1161 + int(($s % (21 * 28)) / 28); 127 my $t = 0x11a7 + $s % 28; 128 return "HANGUL SYLLABLE " . 129 ($lname[int($s / (21 * 28))] . 130 $vname[int(($s % (21 * 28)) / 28)] . 131 $tname[$s % 28 - 1]); 132 } 133 134 return $name{$ucs}; 135} 136 137sub is_unicode { 138 my $ucs = shift(@_); 139 140 # The intervals used here reflect Unicode Version 3.2 141 if (($ucs >= 0x3400 && $ucs <= 0x4db5) || 142 ($ucs >= 0x4e00 && $ucs <= 0x9fa5) || 143 ($ucs >= 0xac00 && $ucs <= 0xd7a3) || 144 ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) { 145 return 1; 146 } 147 148 return exists $name{$ucs}; 149} 150 151my @search_path = (); 152if ($RealBin =~ m|^(.*)/bin\z| && -d "$1/share/uniset") { 153 push @search_path, "$1/share/uniset"; 154} else { 155 push @search_path, $RealBin; 156} 157 158sub search_open { 159 my ($mode, $fn) = @_; 160 my $file; 161 return $file if open($file, $mode, $fn); 162 return undef if $fn =~ m|/|; 163 for my $path (@search_path) { 164 return $file if open($file, $mode, "$path/$fn"); 165 } 166 return undef; 167} 168 169my $html = 0; 170my $image = 0; 171my $adducs = 0; 172my $unicodedata = "UnicodeData.txt"; 173my $blockdata = "Blocks.txt"; 174 175# read list of all Unicode names 176my $data = search_open('<', $unicodedata); 177unless ($data) { 178 die ("Can't open Unicode database '$unicodedata':\n$!\n\n" . 179 "Please make sure that you have downloaded the file\n" . 180 "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n"); 181} 182while (<$data>) { 183 if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) { 184 next if $2 ne '<control>' && substr($2, 0, 1) eq '<'; 185 $ucs = hex($1); 186 $name{$ucs} = $2; 187 $invname{$2} = $ucs; 188 $category{$ucs} = $3; 189 $comment{$ucs} = $12; 190 } else { 191 die("Syntax error in line '$_' in file '$unicodedata'\n"); 192 } 193} 194close($data); 195 196# read list of all Unicode blocks 197$data = search_open('<', $blockdata); 198unless ($data) { 199 die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" . 200 "Please make sure that you have downloaded the file\n" . 201 "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n"); 202} 203my $blocks = 0; 204my (@blockstart, @blockend, @blockname); 205while (<$data>) { 206 if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) { 207 $blockstart[$blocks] = hex($1); 208 $blockend [$blocks] = hex($2); 209 $blockname [$blocks] = $3; 210 $blocks++; 211 } elsif (/^\s*\#/ || /^\s*$/) { 212 # ignore comments and empty lines 213 } else { 214 die("Syntax error in line '$_' in file '$blockdata'\n"); 215 } 216} 217close($data); 218if ($blockend[$blocks-1] < 0x110000) { 219 $blockstart[$blocks] = 0x110000; 220 $blockend [$blocks] = 0x7FFFFFFF; 221 $blockname [$blocks] = "Beyond Plane 16"; 222 $blocks++; 223} 224 225# process command line arguments 226while ($_ = shift(@ARGV)) { 227 if (/^html$/) { 228 $html = 1; 229 } elsif (/^ucs$/) { 230 $adducs = 1; 231 } elsif (/^img$/) { 232 $html = 1; 233 $image = 1; 234 } elsif (/^template$/) { 235 $template = shift(@ARGV); 236 open(TEMPLATE, $template) || die("Can't open template file '$template': $!\n"); 237 while (<TEMPLATE>) { 238 if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) { 239 open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n"); 240 while (<INCLUDE>) { 241 print $_; 242 } 243 close(INCLUDE); 244 } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) { 245 open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n"); 246 while (<INCLUDE>) { 247 s/&/&/g; 248 s/</</g; 249 print $_; 250 } 251 close(INCLUDE); 252 } else { 253 print $_; 254 } 255 } 256 close(TEMPLATE); 257 } elsif (/^\+cat=(.+)$/) { 258 # add characters with given category 259 $cat = $1; 260 for $i (keys(%category)) { 261 $used{$i} = "[${cat}]" if $category{$i} eq $cat; 262 } 263 } elsif (/^\-cat=(.+)$/) { 264 # remove characters with given category 265 $cat = $1; 266 for $i (keys(%category)) { 267 delete $used{$i} if $category{$i} eq $cat; 268 } 269 } elsif (/^\-cat!=(.+)$/) { 270 # remove characters without given category 271 $cat = $1; 272 for $i (keys(%category)) { 273 delete $used{$i} unless $category{$i} eq $cat; 274 } 275 } elsif (/^([+-]):(.*)/) { 276 $remove = $1 eq "-"; 277 $setfile = $2; 278 $setfile = shift(@ARGV) if $setfile eq ""; 279 push(@SETS, $setfile); 280 open(SET, $setfile) || die("Can't open set file '$setfile': $!\n"); 281 $setname = $setfile; 282 while (<SET>) { 283 while ($_) { 284 $i = ord($_); 285 $used{$i} .= "[${setname}]" unless $remove; 286 delete $used{$i} if $remove; 287 $_ = substr($_, 1); 288 } 289 } 290 close SET; 291 } elsif (/^([+-])(.*)/) { 292 $remove = $1 eq "-"; 293 $setfile = $2; 294 $setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/; 295 if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) { 296 # handle intervall specification on command line 297 $first = hex($1); 298 $last = hex($3); 299 for ($i = $first; $i <= $last; $i++) { 300 $used{$i} .= "[ARG]" unless $remove; 301 delete $used{$i} if $remove; 302 } 303 next; 304 } 305 $setfile = shift(@ARGV) if $setfile eq ""; 306 push(@SETS, $setfile); 307 my $setf = search_open('<', $setfile); 308 die("Can't open set file '$setfile': $!\n") unless $setf; 309 $cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name 310 $setname = $setfile; 311 $setname =~ s/([^.\[\]]*)\..*/$1/; 312 while (<$setf>) { 313 if (/^<code_set_name>/) { 314 # handle ISO 15897 (POSIX registry) charset mapping format 315 undef $comment_char; 316 undef $escape_char; 317 while (<$setf>) { 318 if ($comment_char && /^$comment_char/) { 319 # remove comments 320 $_ = $`; 321 } 322 next if (/^\032?\s*$/); # skip empty lines 323 if (/^<comment_char> (\S)$/) { 324 $comment_char = $1; 325 } elsif (/^<escape_char> (\S)$/) { 326 $escape_char = $1; 327 } elsif (/^(END )?CHARMAP$/) { 328 #ignore 329 } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) { 330 $used{hex($2)} .= "[${setname}{$1}]" unless $remove; 331 delete $used{hex($2)} if $remove; 332 } else { 333 die("Syntax error in line $. in file '$setfile':\n'$_'\n"); 334 } 335 } 336 next; 337 } elsif (/^STARTFONT /) { 338 # handle X11 BDF file 339 while (<$setf>) { 340 if (/^ENCODING\s+([0-9]+)/) { 341 $used{$1} .= "[${setname}]" unless $remove; 342 delete $used{$1} if $remove; 343 } 344 } 345 next; 346 } 347 tr/a-z/A-Z/; # make input uppercase 348 if ($cedf) { 349 if ($. > 4) { 350 if (/^([0-9A-F]{2})\t.?\t(.*)$/) { 351 # handle Kosta Kosti's trans CEDF format 352 next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0)); 353 $ucs = $invname{$2}; 354 die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs; 355 $used{$ucs} .= "[${setname}{$1}]" unless $remove; 356 delete $used{$ucs} if $remove; 357 } else { 358 die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n"); 359 } 360 } 361 next; 362 } 363 if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) { 364 # ignore ftp.unicode.org mapping file lines with #UNDEFINED 365 next; 366 } 367 s/^([^\#]*)\#.*$/$1/; # remove comments 368 next if (/^\032?\s*$/); # skip empty lines 369 if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) { 370 # handle entry from a ftp.unicode.org mapping file 371 $used{hex($4)} .= "[${setname}{$2}]" unless $remove; 372 delete $used{hex($4)} if $remove; 373 } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) { 374 # handle interval specification 375 $first = hex($2); 376 $last = hex($5); 377 for ($i = $first; $i <= $last; $i++) { 378 $used{$i} .= "[${setname}]" unless $remove; 379 delete $used{$i} if $remove; 380 } 381 } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) { 382 # handle lines from P10 MES draft 383 $row = $1; 384 $cols = $_; 385 $cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/; 386 $cols =~ tr/,//d; 387 @cols = split(/\s+/, $cols); 388 for (@cols) { 389 if (/^(..)$/) { 390 $first = hex("$row$1"); 391 $last = $first; 392 } elsif (/^(..)-(..)$/) { 393 $first = hex("$row$1"); 394 $last = hex("$row$2"); 395 } else { 396 die ("this should never happen '$_'"); 397 } 398 for ($i = $first; $i <= $last; $i++) { 399 $used{$i} .= "[${setname}]" unless $remove; 400 delete $used{$i} if $remove; 401 } 402 } 403 } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) { 404 # handle single character 405 $used{hex($2)} .= "[${setname}]" unless $remove; 406 delete $used{hex($2)} if $remove; 407 } else { 408 die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/; 409 } 410 } 411 close $setf; 412 } elsif (/^loadimages$/ || /^loadbigimages$/) { 413 if (/^loadimages$/) { 414 $prefix = "Small.Glyphs"; 415 } else { 416 $prefix = "Glyphs"; 417 } 418 $total = 0; 419 for $i (keys(%used)) { 420 next if ($name{$i} eq "<control>"); 421 $total++; 422 } 423 $count = 0; 424 $| = 1; 425 for $i (sort({$a <=> $b} keys(%used))) { 426 next if ($name{$i} eq "<control>"); 427 $count++; 428 $j = sprintf("%04X", $i); 429 $j =~ /(..)(..)/; 430 $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif"; 431 print("\r$count/$total: $gif"); 432 system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &"); 433 select(undef, undef, undef, 0.2); 434 } 435 print("\n"); 436 exit 0; 437 } elsif (/^giftable/) { 438 # form a table of glyphs (requires pbmtools installed) 439 $count = 0; 440 for $i (keys(%used)) { 441 $count++ unless $name{$i} eq "<control>"; 442 } 443 $width = int(sqrt($count/sqrt(2)) + 0.5); 444 $width = $1 if /^giftable([0-9]+)$/; 445 system("rm -f tmp-*.pnm table.pnm~ table.pnm"); 446 $col = 0; 447 $row = 0; 448 for $i (sort({$a <=> $b} keys(%used))) { 449 next if ($name{$i} eq "<control>"); 450 $j = sprintf("%04X", $i); 451 $j =~ /(..)(..)/; 452 $gif = "Small.Glyphs/$1/U$j.gif"; 453 $pnm = sprintf("tmp-%02x.pnm", $col); 454 $fallback = "Small.Glyphs/FF/UFFFD.gif"; 455 system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }"); 456 if (++$col == $width) { 457 system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm"); 458 if ($row == 0) { 459 system("mv tmp-row.pnm table.pnm"); 460 } else { 461 system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm"); 462 } 463 $row++; 464 $col = 0; 465 system("rm -f tmp-*.pnm table.pnm~"); 466 } 467 } 468 if ($col > 0) { 469 system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm"); 470 if ($row == 0) { 471 system("mv tmp-row.pnm table.pnm"); 472 } else { 473 system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm"); 474 } 475 } 476 system("rm -f table.gif ; ppmtogif table.pnm > table.gif"); 477 system("rm -f tmp-*.pnm table.pnm~ table.pnm"); 478 } elsif (/^table$/) { 479 # go through all used names to print full table 480 print "<TABLE border=2>\n" if $html; 481 for $i (sort({$a <=> $b} keys(%used))) { 482 next if ($name{$i} eq "<control>"); 483 if ($html) { 484 $sources = $used{$i}; 485 $sources =~ s/\]\[/, /g; 486 $sources =~ s/^\[//g; 487 $sources =~ s/\]$//g; 488 $sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g; 489 $j = sprintf("%04X", $i); 490 $j =~ /(..)(..)/; 491 $gif = "Small.Glyphs/$1/U$j.gif"; 492 print "<TR>"; 493 print "<TD><img width=32 height=32 src=\"$gif\">" if $image; 494 printf("<TD>&#%d;", $i) if $adducs; 495 print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i); 496 print " ($comment{$i})" if $comment{$i}; 497 print "</SAMP><TD><SMALL>$sources</SMALL>\n"; 498 } else { 499 printf("%04X \# ", $i); 500 print pack("U", $i) . " " if $adducs; 501 print name($i) ."\n"; 502 } 503 } 504 print "</TABLE>\n" if $html; 505 } elsif (/^imgblock$/) { 506 $width = 16; 507 $width = $1 if /giftable([0-9]+)/; 508 $col = 0; 509 $subline = ""; 510 print "\n<P><TABLE cellspacing=0 cellpadding=0>"; 511 for $i (sort({$a <=> $b} keys(%used))) { 512 print "<TR>" if $col == 0; 513 $j = sprintf("%04X", $i); 514 $j =~ /(..)(..)/; 515 $gif = "Small.Glyphs/$1/U$j.gif"; 516 $alt = name($i); 517 print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">"; 518 $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>"; 519 if (++$col == $width) { 520 print "<TR align=center>$subline"; 521 $col = 0; 522 $subline = ""; 523 } 524 } 525 print "<TR align=center>$subline" if ($col > 0); 526 print "</TABLE>\n"; 527 } elsif (/^sources$/) { 528 # count how many characters are attributed to the various source set combinations 529 print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html; 530 for $i (keys(%used)) { 531 next if ($name{$i} eq "<control>"); 532 $sources = $used{$i}; 533 $sources =~ s/\]\[/, /g; 534 $sources =~ s/^\[//g; 535 $sources =~ s/\]$//g; 536 $sources =~ s/\{(..)\}//g; 537 $contribs{$sources} += 1; 538 } 539 for $j (keys(%contribs)) { 540 print "<TR><TD>$contribs{$j}<TD>$j\n" if $html; 541 } 542 print "</TABLE>\n" if $html; 543 } elsif (/^compact$/) { 544 # print compact table in P10 MES format 545 print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html; 546 print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html; 547 print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html; 548 $current_row = ''; 549 $start_col = ''; 550 $last_col = ''; 551 for $i (sort({$a <=> $b} keys(%used))) { 552 next if ($name{$i} eq "<control>"); 553 $row = sprintf("%02X", $i >> 8); 554 $col = sprintf("%02X", $i & 0xff); 555 if ($row ne $current_row) { 556 if (($last_col ne '') and ($last_col ne $start_col)) { 557 print "-$last_col"; 558 print "</SAMP>" if $html; 559 } 560 print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html; 561 print "\n $row\t" unless $html; 562 $len = 0; 563 $current_row = $row; 564 $start_col = ''; 565 } 566 if ($start_col eq '') { 567 print "$col"; 568 $len += 2; 569 $start_col = $col; 570 $last_col = $col; 571 } elsif (hex($col) == hex($last_col) + 1) { 572 $last_col = $col; 573 } else { 574 if ($last_col ne $start_col) { 575 print "-$last_col"; 576 $len += 3; 577 } 578 if ($len > 60 && !$html) { 579 print "\n $row\t"; 580 $len = 0; 581 }; 582 print " " if $len; 583 print "$col"; 584 $len += 2 + !! $len; 585 $start_col = $col; 586 $last_col = $col; 587 } 588 } 589 if (($last_col ne '') and ($last_col ne $start_col)) { 590 print "-$last_col"; 591 print "</SAMP>" if $html; 592 } 593 print "\n" if ($current_row ne ''); 594 print "</TABLE>\n" if $html; 595 print "\n"; 596 } elsif (/^c$/) { 597 # print table as C interval array 598 print "{"; 599 $last_i = ''; 600 $columns = 3; 601 $col = $columns; 602 for $i (sort({$a <=> $b} keys(%used))) { 603 next if ($name{$i} eq "<control>"); 604 if ($last_i eq '') { 605 if (++$col > $columns) { $col = 1; print "\n "; } 606 printf(" { 0x%04X, ", $i); 607 $last_i = $i; 608 } elsif ($i == $last_i + 1) { 609 $last_i = $i; 610 } else { 611 printf("0x%04X },", $last_i); 612 if (++$col > $columns) { $col = 1; print "\n "; } 613 printf(" { 0x%04X, ", $i); 614 $last_i = $i; 615 } 616 } 617 if ($last_i ne '') { 618 printf("0x%04X }", $last_i); 619 } 620 print "\n};\n"; 621 } elsif (/^utf8-list$/) { 622 $col = 0; 623 $block = 0; 624 $last = -1; 625 for $i (sort({$a <=> $b} keys(%used))) { 626 next if ($name{$i} eq "<control>"); 627 while ($blockend[$block] < $i && $block < $blocks - 1) { 628 $block++; 629 } 630 if ($last <= $blockend[$block-1] && 631 $i < $blockstart[$block]) { 632 print "\n" if ($col); 633 printf "\nFree block (U+%04X-U+%04X):\n\n", 634 $blockend[$block-1] + 1, $blockstart[$block] - 1; 635 $col = 0; 636 } 637 if ($last < $blockstart[$block] && $i >= $blockstart[$block]) { 638 print "\n" if ($col); 639 printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n", 640 $blockstart[$block], $blockend[$block]; 641 $col = 0; 642 } 643 if ($category{$i} eq 'Mn') { 644 # prefix non-spacing character with U+25CC DOTTED CIRCLE 645 print "\x{25CC}"; 646 } elsif ($category{$i} eq 'Me') { 647 # prefix enclosing non-spacing character with space 648 print " "; 649 } 650 print pack("U", $i); 651 $col += 1 + iswide($i); 652 if ($col >= 64) { 653 print "\n"; 654 $col = 0; 655 } 656 $last = $i; 657 } 658 print "\n" if ($col); 659 } elsif (/^collections$/) { 660 $block = 0; 661 $last = -1; 662 for $i (sort({$a <=> $b} keys(%used))) { 663 next if ($name{$i} eq "<control>"); 664 while ($blockend[$block] < $i && $block < $blocks - 1) { 665 $block++; 666 } 667 if ($last < $blockstart[$block] && $i >= $blockstart[$block]) { 668 print $blockname[$block], 669 " " x (40 - length($blockname[$block])); 670 printf "%04X-%04X\n", 671 $blockstart[$block], $blockend[$block]; 672 } 673 $last = $i; 674 } 675 } elsif (/^nr$/) { 676 print "<P>" if $html; 677 print "# " unless $html; 678 print "Number of characters in above table: "; 679 $count = 0; 680 for $i (keys(%used)) { 681 $count++ unless $name{$i} eq "<control>"; 682 } 683 print $count; 684 print "\n"; 685 } elsif (/^clean$/) { 686 # remove characters from set that are not in $unicodedata 687 for $i (keys(%used)) { 688 delete $used{$i} unless is_unicode($i); 689 } 690 } elsif (/^unknown$/) { 691 # remove characters from set that are in $unicodedata 692 for $i (keys(%used)) { 693 delete $used{$i} if is_unicode($i); 694 } 695 } else { 696 die("Unknown command line command '$_'\n"); 697 }; 698} 699