Lines Matching +full:0 +full:x00000000 +full:- +full:0 +full:x03ffffff

16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
30 /* Default names of the in- and output files. */
50 int verbose = 0;
63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
65 /* ------------------------------------------------------------------ */
80 #define UNICODE_MAJ_MAX ((unsigned short)-1)
81 #define UNICODE_MIN_MAX ((unsigned char)-1)
82 #define UNICODE_REV_MAX ((unsigned char)-1)
98 return 0; in age_valid()
100 return 0; in age_valid()
102 return 0; in age_valid()
106 /* ------------------------------------------------------------------ */
111 * A compact binary tree, used to decode UTF-8 characters.
116 * NEXTBYTE - flag - advance to next byte if set
117 * BITNUM - 3 bit field - the bit number to be tested
118 * OFFLEN - 2 bit field - number of bytes in the offset
119 * if offlen == 0 (non-branching node)
120 * RIGHTPATH - 1 bit field - set if the following node is for the
121 * right-hand path (tested bit is set)
122 * TRIENODE - 1 bit field - set if the following node is an internal
124 * if offlen != 0 (branching node)
125 * LEFTNODE - 1 bit field - set if the left-hand node is internal
126 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
133 #define BITNUM 0x07
134 #define NEXTBYTE 0x08
135 #define OFFLEN 0x30
137 #define RIGHTPATH 0x40
138 #define TRIENODE 0x80
139 #define RIGHTNODE 0x40
140 #define LEFTNODE 0x80
148 * leaf[0]: The unicode version, stored as a generation number that is
151 * defined. The CCC of a non-defined code point is 0.
154 * with a non-zero CCC that occur between two characters with
155 * a CCC of 0, or at the beginning or end of a string.
157 * between 0 and 254 inclusive, which leaves 255 available as
159 * Code points with CCC 0 are known as stoppers.
161 * start of a NUL-terminated string that is the decomposition
167 * These do affect normalization, as they all have CCC 0.
175 #define LEAF_GEN(LEAF) ((LEAF)[0])
181 #define MINCCC (0)
183 #define STOPPER (0)
200 /* ------------------------------------------------------------------ */
205 * The UTF-8 encoding spreads the bits of a 32bit word over several
209 * 0x00000000 0x0000007F: 0xxxxxxx
210 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
211 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
212 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
213 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
216 * There is an additional requirement on UTF-8, in that only the
221 * 0x00000000 0x0000007F: 0xxxxxxx
222 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
223 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
224 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
225 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
228 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
232 * 0 - 0x7f: 0 0x7f
233 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
234 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
235 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
238 * 0xd800 - 0xdfff should never be seen.
241 * the same as a single UTF-32 character. This makes the UTF-8
242 * representation of Unicode strictly smaller than UTF-32.
245 * Corrigendum #1: UTF-8 Shortest Form
251 #define UTF8_2_BITS 0xC0
252 #define UTF8_3_BITS 0xE0
253 #define UTF8_4_BITS 0xF0
254 #define UTF8_N_BITS 0x80
255 #define UTF8_2_MASK 0xE0
256 #define UTF8_3_MASK 0xF0
257 #define UTF8_4_MASK 0xF8
258 #define UTF8_N_MASK 0xC0
259 #define UTF8_V_MASK 0x3F
266 if (val < 0x80) { in utf8encode()
267 str[0] = val; in utf8encode()
269 } else if (val < 0x800) { in utf8encode()
273 str[0] = val; in utf8encode()
274 str[0] |= UTF8_2_BITS; in utf8encode()
276 } else if (val < 0x10000) { in utf8encode()
283 str[0] = val; in utf8encode()
284 str[0] |= UTF8_3_BITS; in utf8encode()
286 } else if (val < 0x110000) { in utf8encode()
296 str[0] = val; in utf8encode()
297 str[0] |= UTF8_4_BITS; in utf8encode()
301 len = 0; in utf8encode()
309 unsigned int unichar = 0; in utf8decode()
311 if (*s < 0x80) { in utf8decode()
314 unichar = *s++ & 0x1F; in utf8decode()
316 unichar |= *s & 0x3F; in utf8decode()
318 unichar = *s++ & 0x0F; in utf8decode()
320 unichar |= *s++ & 0x3F; in utf8decode()
322 unichar |= *s & 0x3F; in utf8decode()
324 unichar = *s++ & 0x0F; in utf8decode()
326 unichar |= *s++ & 0x3F; in utf8decode()
328 unichar |= *s++ & 0x3F; in utf8decode()
330 unichar |= *s & 0x3F; in utf8decode()
337 return unichar < 0x110000; in utf32valid()
340 #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
343 #define LEAF 0
357 int leafindex[0x110000];
385 node = tree->root; in lookup()
387 if (node->nextbyte) in lookup()
389 if (*key & (1 << (node->bitnum & 7))) { in lookup()
391 if (node->rightnode == NODE) { in lookup()
392 node = node->right; in lookup()
393 } else if (node->rightnode == LEAF) { in lookup()
394 leaf = node->right; in lookup()
400 if (node->leftnode == NODE) { in lookup()
401 node = node->left; in lookup()
402 } else if (node->leftnode == LEAF) { in lookup()
403 leaf = node->left; in lookup()
414 * A simple non-recursive tree walker: keep track of visits to the
426 nodes = singletons = leaves = 0; in tree_walk()
428 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); in tree_walk()
429 if (tree->childnode == LEAF) { in tree_walk()
430 assert(tree->root); in tree_walk()
431 tree->leaf_print(tree->root, indent); in tree_walk()
434 assert(tree->childnode == NODE); in tree_walk()
435 node = tree->root; in tree_walk()
436 leftmask = rightmask = 0; in tree_walk()
441 node->bitnum, node->nextbyte, in tree_walk()
442 node->left, node->right, in tree_walk()
443 node->keymask, node->keybits); in tree_walk()
445 if (!(node->left && node->right)) in tree_walk()
449 bitmask = 1 << node->bitnum; in tree_walk()
450 if ((leftmask & bitmask) == 0) { in tree_walk()
452 if (node->leftnode == LEAF) { in tree_walk()
453 assert(node->left); in tree_walk()
454 tree->leaf_print(node->left, in tree_walk()
457 } else if (node->left) { in tree_walk()
458 assert(node->leftnode == NODE); in tree_walk()
460 node = node->left; in tree_walk()
464 if ((rightmask & bitmask) == 0) { in tree_walk()
466 if (node->rightnode == LEAF) { in tree_walk()
467 assert(node->right); in tree_walk()
468 tree->leaf_print(node->right, in tree_walk()
471 } else if (node->right) { in tree_walk()
472 assert(node->rightnode == NODE); in tree_walk()
474 node = node->right; in tree_walk()
480 node = node->parent; in tree_walk()
481 indent -= 1; in tree_walk()
498 node->left = node->right = NULL; in alloc_node()
499 node->parent = parent; in alloc_node()
500 node->leftnode = NODE; in alloc_node()
501 node->rightnode = NODE; in alloc_node()
502 node->keybits = 0; in alloc_node()
503 node->keymask = 0; in alloc_node()
504 node->mark = 0; in alloc_node()
505 node->index = 0; in alloc_node()
506 node->offset = -1; in alloc_node()
507 node->size = 4; in alloc_node()
509 if (node->parent) { in alloc_node()
510 bitnum = parent->bitnum; in alloc_node()
511 if ((bitnum & 7) == 0) { in alloc_node()
512 node->bitnum = bitnum + 7 + 8; in alloc_node()
513 node->nextbyte = 1; in alloc_node()
515 node->bitnum = bitnum - 1; in alloc_node()
516 node->nextbyte = 0; in alloc_node()
519 node->bitnum = 7; in alloc_node()
520 node->nextbyte = 0; in alloc_node()
543 cursor = &tree->root; in insert()
551 if (node->nextbyte) in insert()
553 if (*key & (1 << (node->bitnum & 7))) in insert()
554 cursor = &node->right; in insert()
556 cursor = &node->left; in insert()
557 keybits--; in insert()
563 if (*key & (1 << (node->bitnum & 7))) in insert()
564 node->rightnode = LEAF; in insert()
566 node->leftnode = LEAF; in insert()
567 if (node->nextbyte) in insert()
569 if (node->leftnode == NODE || node->rightnode == NODE) in insert()
571 assert(node->left); in insert()
572 assert(node->right); in insert()
574 if (! tree->leaf_equal(node->left, node->right)) in insert()
577 leaf = node->left; in insert()
579 parent = node->parent; in insert()
582 tree->root = leaf; in insert()
583 tree->childnode = LEAF; in insert()
584 } else if (parent->left == node) { in insert()
585 parent->left = leaf; in insert()
586 parent->leftnode = LEAF; in insert()
587 if (parent->right) { in insert()
588 parent->keymask = 0; in insert()
589 parent->keybits = 0; in insert()
591 parent->keymask |= (1 << node->bitnum); in insert()
593 } else if (parent->right == node) { in insert()
594 parent->right = leaf; in insert()
595 parent->rightnode = LEAF; in insert()
596 if (parent->left) { in insert()
597 parent->keymask = 0; in insert()
598 parent->keybits = 0; in insert()
600 parent->keymask |= (1 << node->bitnum); in insert()
601 parent->keybits |= (1 << node->bitnum); in insert()
605 assert(0); in insert()
613 parent = node->parent; in insert()
617 if (node->keymask == 0) { in insert()
618 parent->keymask = 0; in insert()
619 parent->keybits = 0; in insert()
620 } else if (parent->left && parent->right) { in insert()
621 parent->keymask = 0; in insert()
622 parent->keybits = 0; in insert()
624 assert((parent->keymask & node->keymask) == 0); in insert()
625 parent->keymask |= node->keymask; in insert()
626 parent->keymask |= (1 << parent->bitnum); in insert()
627 parent->keybits |= node->keybits; in insert()
628 if (parent->right) in insert()
629 parent->keybits |= (1 << parent->bitnum); in insert()
634 return 0; in insert()
667 if (verbose > 0) in prune()
668 printf("Pruning %s_%x\n", tree->type, tree->maxage); in prune()
670 count = 0; in prune()
671 if (tree->childnode == LEAF) in prune()
673 if (!tree->root) in prune()
676 leftmask = rightmask = 0; in prune()
677 node = tree->root; in prune()
679 if (node->nextbyte) in prune()
681 if (node->leftnode == LEAF) in prune()
683 if (node->rightnode == LEAF) in prune()
685 if (!node->left) in prune()
687 if (!node->right) in prune()
689 left = node->left; in prune()
690 right = node->right; in prune()
691 if (left->keymask == 0) in prune()
693 if (right->keymask == 0) in prune()
695 if (left->keymask != right->keymask) in prune()
697 if (left->keybits != right->keybits) in prune()
701 assert(left->left || left->right); in prune()
702 if (left->leftnode == LEAF) in prune()
703 leftleaf = left->left; in prune()
704 else if (left->rightnode == LEAF) in prune()
705 leftleaf = left->right; in prune()
706 else if (left->left) in prune()
707 left = left->left; in prune()
708 else if (left->right) in prune()
709 left = left->right; in prune()
711 assert(0); in prune()
715 assert(right->left || right->right); in prune()
716 if (right->leftnode == LEAF) in prune()
717 rightleaf = right->left; in prune()
718 else if (right->rightnode == LEAF) in prune()
719 rightleaf = right->right; in prune()
720 else if (right->left) in prune()
721 right = right->left; in prune()
722 else if (right->right) in prune()
723 right = right->right; in prune()
725 assert(0); in prune()
727 if (! tree->leaf_equal(leftleaf, rightleaf)) in prune()
730 * This node has identical singleton-only subtrees. in prune()
733 parent = node->parent; in prune()
734 left = node->left; in prune()
735 right = node->right; in prune()
736 if (parent->left == node) in prune()
737 parent->left = left; in prune()
738 else if (parent->right == node) in prune()
739 parent->right = left; in prune()
741 assert(0); in prune()
742 left->parent = parent; in prune()
743 left->keymask |= (1 << node->bitnum); in prune()
744 node->left = NULL; in prune()
746 bitmask = 1 << node->bitnum; in prune()
749 if (node->leftnode == NODE && node->left) { in prune()
750 left = node->left; in prune()
754 } else if (node->rightnode == NODE && node->right) { in prune()
755 right = node->right; in prune()
765 /* Force re-check */ in prune()
766 bitmask = 1 << node->bitnum; in prune()
770 if (node->left && node->right) in prune()
772 if (node->left) { in prune()
773 left = node->left; in prune()
774 node->keymask |= left->keymask; in prune()
775 node->keybits |= left->keybits; in prune()
777 if (node->right) { in prune()
778 right = node->right; in prune()
779 node->keymask |= right->keymask; in prune()
780 node->keybits |= right->keybits; in prune()
782 node->keymask |= (1 << node->bitnum); in prune()
783 node = node->parent; in prune()
784 /* Force re-check */ in prune()
785 bitmask = 1 << node->bitnum; in prune()
790 bitmask = 1 << node->bitnum; in prune()
791 if ((leftmask & bitmask) == 0 && in prune()
792 node->leftnode == NODE && in prune()
793 node->left) { in prune()
795 node = node->left; in prune()
796 } else if ((rightmask & bitmask) == 0 && in prune()
797 node->rightnode == NODE && in prune()
798 node->right) { in prune()
800 node = node->right; in prune()
804 node = node->parent; in prune()
807 if (verbose > 0) in prune()
824 marked = 0; in mark_nodes()
825 if (verbose > 0) in mark_nodes()
826 printf("Marking %s_%x\n", tree->type, tree->maxage); in mark_nodes()
827 if (tree->childnode == LEAF) in mark_nodes()
830 assert(tree->childnode == NODE); in mark_nodes()
831 node = tree->root; in mark_nodes()
832 leftmask = rightmask = 0; in mark_nodes()
834 bitmask = 1 << node->bitnum; in mark_nodes()
835 if ((leftmask & bitmask) == 0) { in mark_nodes()
837 if (node->leftnode == LEAF) { in mark_nodes()
838 assert(node->left); in mark_nodes()
839 if (tree->leaf_mark(node->left)) { in mark_nodes()
841 while (n && !n->mark) { in mark_nodes()
843 n->mark = 1; in mark_nodes()
844 n = n->parent; in mark_nodes()
847 } else if (node->left) { in mark_nodes()
848 assert(node->leftnode == NODE); in mark_nodes()
849 node = node->left; in mark_nodes()
853 if ((rightmask & bitmask) == 0) { in mark_nodes()
855 if (node->rightnode == LEAF) { in mark_nodes()
856 assert(node->right); in mark_nodes()
857 if (tree->leaf_mark(node->right)) { in mark_nodes()
859 while (n && !n->mark) { in mark_nodes()
861 n->mark = 1; in mark_nodes()
862 n = n->parent; in mark_nodes()
865 } else if (node->right) { in mark_nodes()
866 assert(node->rightnode == NODE); in mark_nodes()
867 node = node->right; in mark_nodes()
873 node = node->parent; in mark_nodes()
878 assert(tree->childnode == NODE); in mark_nodes()
879 node = tree->root; in mark_nodes()
880 leftmask = rightmask = 0; in mark_nodes()
882 bitmask = 1 << node->bitnum; in mark_nodes()
883 if ((leftmask & bitmask) == 0) { in mark_nodes()
885 if (node->leftnode == LEAF) { in mark_nodes()
886 assert(node->left); in mark_nodes()
887 if (tree->leaf_mark(node->left)) { in mark_nodes()
889 while (n && !n->mark) { in mark_nodes()
891 n->mark = 1; in mark_nodes()
892 n = n->parent; in mark_nodes()
895 } else if (node->left) { in mark_nodes()
896 assert(node->leftnode == NODE); in mark_nodes()
897 node = node->left; in mark_nodes()
898 if (!node->mark && node->parent->mark) { in mark_nodes()
900 node->mark = 1; in mark_nodes()
905 if ((rightmask & bitmask) == 0) { in mark_nodes()
907 if (node->rightnode == LEAF) { in mark_nodes()
908 assert(node->right); in mark_nodes()
909 if (tree->leaf_mark(node->right)) { in mark_nodes()
911 while (n && !n->mark) { in mark_nodes()
913 n->mark = 1; in mark_nodes()
914 n = n->parent; in mark_nodes()
917 } else if (node->right) { in mark_nodes()
918 assert(node->rightnode == NODE); in mark_nodes()
919 node = node->right; in mark_nodes()
920 if (!node->mark && node->parent->mark && in mark_nodes()
921 !node->parent->left) { in mark_nodes()
923 node->mark = 1; in mark_nodes()
930 node = node->parent; in mark_nodes()
933 if (verbose > 0) in mark_nodes()
939 * emitted trie. These values must be pre-computed because relative
954 tree->index = index; in index_nodes()
956 count = 0; in index_nodes()
958 if (verbose > 0) in index_nodes()
959 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); in index_nodes()
960 if (tree->childnode == LEAF) { in index_nodes()
961 index += tree->leaf_size(tree->root); in index_nodes()
965 assert(tree->childnode == NODE); in index_nodes()
966 node = tree->root; in index_nodes()
967 leftmask = rightmask = 0; in index_nodes()
969 if (!node->mark) in index_nodes()
972 if (node->index != index) in index_nodes()
973 node->index = index; in index_nodes()
974 index += node->size; in index_nodes()
977 bitmask = 1 << node->bitnum; in index_nodes()
978 if (node->mark && (leftmask & bitmask) == 0) { in index_nodes()
980 if (node->leftnode == LEAF) { in index_nodes()
981 assert(node->left); in index_nodes()
982 *tree->leaf_index(tree, node->left) = in index_nodes()
984 index += tree->leaf_size(node->left); in index_nodes()
986 } else if (node->left) { in index_nodes()
987 assert(node->leftnode == NODE); in index_nodes()
989 node = node->left; in index_nodes()
993 if (node->mark && (rightmask & bitmask) == 0) { in index_nodes()
995 if (node->rightnode == LEAF) { in index_nodes()
996 assert(node->right); in index_nodes()
997 *tree->leaf_index(tree, node->right) = index; in index_nodes()
998 index += tree->leaf_size(node->right); in index_nodes()
1000 } else if (node->right) { in index_nodes()
1001 assert(node->rightnode == NODE); in index_nodes()
1003 node = node->right; in index_nodes()
1009 node = node->parent; in index_nodes()
1010 indent -= 1; in index_nodes()
1017 if (verbose > 0) in index_nodes()
1029 if (!node || node->mark) in mark_subtree()
1030 return 0; in mark_subtree()
1031 node->mark = 1; in mark_subtree()
1032 node->index = node->parent->index; in mark_subtree()
1034 if (node->leftnode == NODE) in mark_subtree()
1035 changed += mark_subtree(node->left); in mark_subtree()
1036 if (node->rightnode == NODE) in mark_subtree()
1037 changed += mark_subtree(node->right); in mark_subtree()
1043 * each node needs to store a three-byte offset. The indexes of the
1066 changed = 0; in size_nodes()
1067 size = 0; in size_nodes()
1069 if (verbose > 0) in size_nodes()
1070 printf("Sizing %s_%x\n", tree->type, tree->maxage); in size_nodes()
1071 if (tree->childnode == LEAF) in size_nodes()
1074 assert(tree->childnode == NODE); in size_nodes()
1075 pathbits = 0; in size_nodes()
1076 pathmask = 0; in size_nodes()
1077 node = tree->root; in size_nodes()
1078 leftmask = rightmask = 0; in size_nodes()
1080 if (!node->mark) in size_nodes()
1082 offset = 0; in size_nodes()
1083 if (!node->left || !node->right) { in size_nodes()
1086 if (node->rightnode == NODE) { in size_nodes()
1093 right = node->right; in size_nodes()
1094 next = tree->next; in size_nodes()
1095 while (!right->mark) { in size_nodes()
1097 n = next->root; in size_nodes()
1098 while (n->bitnum != node->bitnum) { in size_nodes()
1099 nbit = 1 << n->bitnum; in size_nodes()
1103 if (n->rightnode == LEAF) in size_nodes()
1105 n = n->right; in size_nodes()
1107 if (n->leftnode == LEAF) in size_nodes()
1109 n = n->left; in size_nodes()
1112 if (n->bitnum != node->bitnum) in size_nodes()
1114 n = n->right; in size_nodes()
1116 next = next->next; in size_nodes()
1119 if (!right->mark) in size_nodes()
1121 offset = right->index - node->index; in size_nodes()
1123 offset = *tree->leaf_index(tree, node->right); in size_nodes()
1124 offset -= node->index; in size_nodes()
1126 assert(offset >= 0); in size_nodes()
1127 assert(offset <= 0xffffff); in size_nodes()
1128 if (offset <= 0xff) { in size_nodes()
1130 } else if (offset <= 0xffff) { in size_nodes()
1132 } else { /* offset <= 0xffffff */ in size_nodes()
1136 if (node->size != size || node->offset != offset) { in size_nodes()
1137 node->size = size; in size_nodes()
1138 node->offset = offset; in size_nodes()
1143 bitmask = 1 << node->bitnum; in size_nodes()
1145 if (node->mark && (leftmask & bitmask) == 0) { in size_nodes()
1147 if (node->leftnode == LEAF) { in size_nodes()
1148 assert(node->left); in size_nodes()
1149 } else if (node->left) { in size_nodes()
1150 assert(node->leftnode == NODE); in size_nodes()
1152 node = node->left; in size_nodes()
1156 if (node->mark && (rightmask & bitmask) == 0) { in size_nodes()
1159 if (node->rightnode == LEAF) { in size_nodes()
1160 assert(node->right); in size_nodes()
1161 } else if (node->right) { in size_nodes()
1162 assert(node->rightnode == NODE); in size_nodes()
1164 node = node->right; in size_nodes()
1172 node = node->parent; in size_nodes()
1173 indent -= 1; in size_nodes()
1177 if (verbose > 0) in size_nodes()
1201 nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; in emit()
1202 leaves = 0; in emit()
1203 bytes = 0; in emit()
1204 index = tree->index; in emit()
1207 if (verbose > 0) in emit()
1208 printf("Emitting %s_%x\n", tree->type, tree->maxage); in emit()
1209 if (tree->childnode == LEAF) { in emit()
1210 assert(tree->root); in emit()
1211 tree->leaf_emit(tree->root, data); in emit()
1212 size = tree->leaf_size(tree->root); in emit()
1218 assert(tree->childnode == NODE); in emit()
1219 node = tree->root; in emit()
1220 leftmask = rightmask = 0; in emit()
1222 if (!node->mark) in emit()
1224 assert(node->offset != -1); in emit()
1225 assert(node->index == index); in emit()
1227 byte = 0; in emit()
1228 if (node->nextbyte) in emit()
1230 byte |= (node->bitnum & BITNUM); in emit()
1231 if (node->left && node->right) { in emit()
1232 if (node->leftnode == NODE) in emit()
1234 if (node->rightnode == NODE) in emit()
1236 if (node->offset <= 0xff) in emit()
1238 else if (node->offset <= 0xffff) in emit()
1243 offset = node->offset; in emit()
1247 while (offlen--) { in emit()
1248 *data++ = offset & 0xff; in emit()
1252 } else if (node->left) { in emit()
1253 if (node->leftnode == NODE) in emit()
1255 nodes[0]++; in emit()
1258 } else if (node->right) { in emit()
1260 if (node->rightnode == NODE) in emit()
1262 nodes[0]++; in emit()
1266 assert(0); in emit()
1270 bitmask = 1 << node->bitnum; in emit()
1271 if (node->mark && (leftmask & bitmask) == 0) { in emit()
1273 if (node->leftnode == LEAF) { in emit()
1274 assert(node->left); in emit()
1275 data = tree->leaf_emit(node->left, in emit()
1277 size = tree->leaf_size(node->left); in emit()
1281 } else if (node->left) { in emit()
1282 assert(node->leftnode == NODE); in emit()
1284 node = node->left; in emit()
1288 if (node->mark && (rightmask & bitmask) == 0) { in emit()
1290 if (node->rightnode == LEAF) { in emit()
1291 assert(node->right); in emit()
1292 data = tree->leaf_emit(node->right, in emit()
1294 size = tree->leaf_size(node->right); in emit()
1298 } else if (node->right) { in emit()
1299 assert(node->rightnode == NODE); in emit()
1301 node = node->right; in emit()
1307 node = node->parent; in emit()
1308 indent -= 1; in emit()
1312 if (verbose > 0) { in emit()
1316 nodes[0] + nodes[1] + nodes[2] + nodes[3], in emit()
1317 nodes[0], nodes[1], nodes[2], nodes[3]); in emit()
1318 printf(" %d total\n", index - tree->index); in emit()
1322 /* ------------------------------------------------------------------ */
1350 struct unicode_data unicode_data[0x110000];
1368 for (i = 0; i != corrections_count; i++) in corrections_lookup()
1369 if (u->code == corrections[i].code) in corrections_lookup()
1379 if (left->gen != right->gen) in nfdi_equal()
1380 return 0; in nfdi_equal()
1381 if (left->ccc != right->ccc) in nfdi_equal()
1382 return 0; in nfdi_equal()
1383 if (left->utf8nfdi && right->utf8nfdi && in nfdi_equal()
1384 strcmp(left->utf8nfdi, right->utf8nfdi) == 0) in nfdi_equal()
1386 if (left->utf8nfdi || right->utf8nfdi) in nfdi_equal()
1387 return 0; in nfdi_equal()
1396 if (left->gen != right->gen) in nfdicf_equal()
1397 return 0; in nfdicf_equal()
1398 if (left->ccc != right->ccc) in nfdicf_equal()
1399 return 0; in nfdicf_equal()
1400 if (left->utf8nfdicf && right->utf8nfdicf && in nfdicf_equal()
1401 strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) in nfdicf_equal()
1403 if (left->utf8nfdicf && right->utf8nfdicf) in nfdicf_equal()
1404 return 0; in nfdicf_equal()
1405 if (left->utf8nfdicf || right->utf8nfdicf) in nfdicf_equal()
1406 return 0; in nfdicf_equal()
1407 if (left->utf8nfdi && right->utf8nfdi && in nfdicf_equal()
1408 strcmp(left->utf8nfdi, right->utf8nfdi) == 0) in nfdicf_equal()
1410 if (left->utf8nfdi || right->utf8nfdi) in nfdicf_equal()
1411 return 0; in nfdicf_equal()
1420 leaf->code, leaf->ccc, leaf->gen); in nfdi_print()
1422 if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) in nfdi_print()
1424 else if (leaf->utf8nfdi) in nfdi_print()
1425 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); in nfdi_print()
1435 leaf->code, leaf->ccc, leaf->gen); in nfdicf_print()
1437 if (leaf->utf8nfdicf) in nfdicf_print()
1438 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); in nfdicf_print()
1439 else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) in nfdicf_print()
1441 else if (leaf->utf8nfdi) in nfdicf_print()
1442 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); in nfdicf_print()
1455 if (leaf->utf8nfdicf) in nfdicf_mark()
1457 return 0; in nfdicf_mark()
1464 return leaf->correction; in correction_mark()
1472 if (HANGUL_SYLLABLE(leaf->code)) in nfdi_size()
1474 else if (leaf->utf8nfdi) in nfdi_size()
1475 size += strlen(leaf->utf8nfdi) + 1; in nfdi_size()
1484 if (HANGUL_SYLLABLE(leaf->code)) in nfdicf_size()
1486 else if (leaf->utf8nfdicf) in nfdicf_size()
1487 size += strlen(leaf->utf8nfdicf) + 1; in nfdicf_size()
1488 else if (leaf->utf8nfdi) in nfdicf_size()
1489 size += strlen(leaf->utf8nfdi) + 1; in nfdicf_size()
1497 return &tree->leafindex[leaf->code]; in nfdi_index()
1504 return &tree->leafindex[leaf->code]; in nfdicf_index()
1512 *data++ = leaf->gen; in nfdi_emit()
1514 if (HANGUL_SYLLABLE(leaf->code)) { in nfdi_emit()
1517 } else if (leaf->utf8nfdi) { in nfdi_emit()
1519 s = (unsigned char*)leaf->utf8nfdi; in nfdi_emit()
1520 while ((*data++ = *s++) != 0) in nfdi_emit()
1523 *data++ = leaf->ccc; in nfdi_emit()
1533 *data++ = leaf->gen; in nfdicf_emit()
1535 if (HANGUL_SYLLABLE(leaf->code)) { in nfdicf_emit()
1538 } else if (leaf->utf8nfdicf) { in nfdicf_emit()
1540 s = (unsigned char*)leaf->utf8nfdicf; in nfdicf_emit()
1541 while ((*data++ = *s++) != 0) in nfdicf_emit()
1543 } else if (leaf->utf8nfdi) { in nfdicf_emit()
1545 s = (unsigned char*)leaf->utf8nfdi; in nfdicf_emit()
1546 while ((*data++ = *s++) != 0) in nfdicf_emit()
1549 *data++ = leaf->ccc; in nfdicf_emit()
1561 if (data->utf8nfdi) { in utf8_create()
1562 assert(data->utf8nfdi[0] == HANGUL); in utf8_create()
1567 um = data->utf32nfdi; in utf8_create()
1569 for (i = 0; um[i]; i++) in utf8_create()
1571 *u = '\0'; in utf8_create()
1572 data->utf8nfdi = strdup(utf); in utf8_create()
1575 um = data->utf32nfdicf; in utf8_create()
1577 for (i = 0; um[i]; i++) in utf8_create()
1579 *u = '\0'; in utf8_create()
1580 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) in utf8_create()
1581 data->utf8nfdicf = strdup(utf); in utf8_create()
1590 for (unichar = 0; unichar != 0x110000; unichar++) in utf8_init()
1593 for (i = 0; i != corrections_count; i++) in utf8_init()
1607 count = 0; in trees_init()
1608 nextage = (unsigned int)-1; in trees_init()
1611 nextage = 0; in trees_init()
1612 for (i = 0; i <= corrections_count; i++) { in trees_init()
1614 if (nextage < data->correction && in trees_init()
1615 data->correction < maxage) in trees_init()
1616 nextage = data->correction; in trees_init()
1627 nextage = (unsigned int)-1; in trees_init()
1630 trees[--count].maxage = maxage; in trees_init()
1631 trees[--count].maxage = maxage; in trees_init()
1632 nextage = 0; in trees_init()
1633 for (i = 0; i <= corrections_count; i++) { in trees_init()
1635 if (nextage < data->correction && in trees_init()
1636 data->correction < maxage) in trees_init()
1637 nextage = data->correction; in trees_init()
1642 for (i = 0; i != trees_count; i++) { in trees_init()
1643 j = 0; in trees_init()
1646 trees[i].maxage = ages[j-1]; in trees_init()
1650 trees[trees_count-2].next = &trees[trees_count-1]; in trees_init()
1651 trees[trees_count-1].leaf_mark = nfdi_mark; in trees_init()
1652 trees[trees_count-2].leaf_mark = nfdicf_mark; in trees_init()
1653 for (i = 0; i != trees_count-2; i += 2) { in trees_init()
1654 trees[i].next = &trees[trees_count-2]; in trees_init()
1656 trees[i+1].next = &trees[trees_count-1]; in trees_init()
1661 for (i = 0; i != trees_count; i += 2) { in trees_init()
1678 for (i = 0; i != trees_count; i++) in trees_init()
1690 for (i = 0; i != trees_count; i++) { in trees_populate()
1691 if (verbose > 0) { in trees_populate()
1695 for (unichar = 0; unichar != 0x110000; unichar++) { in trees_populate()
1696 if (unicode_data[unichar].gen < 0) in trees_populate()
1700 if (data->correction <= trees[i].maxage) in trees_populate()
1713 for (i = 0; i != trees_count; i++) in trees_reduce()
1715 for (i = 0; i != trees_count; i++) in trees_reduce()
1718 size = 0; in trees_reduce()
1719 for (i = 0; i != trees_count; i++) in trees_reduce()
1721 changed = 0; in trees_reduce()
1722 for (i = 0; i != trees_count; i++) in trees_reduce()
1728 for (i = 0; i != trees_count; i++) in trees_reduce()
1731 if (verbose > 0) { in trees_reduce()
1732 for (i = 0; i != trees_count; i++) { in trees_reduce()
1738 nfdi = utf8data + trees[trees_count-1].index; in trees_reduce()
1739 nfdicf = utf8data + trees[trees_count-2].index; in trees_reduce()
1741 nfdi_tree = &trees[trees_count-1]; in trees_reduce()
1742 nfdicf_tree = &trees[trees_count-2]; in trees_reduce()
1755 if (verbose > 0) in verify()
1756 printf("Verifying %s_%x\n", tree->type, tree->maxage); in verify()
1757 nocf = strcmp(tree->type, "nfdicf"); in verify()
1759 for (unichar = 0; unichar != 0x110000; unichar++) { in verify()
1760 report = 0; in verify()
1762 if (data->correction <= tree->maxage) in verify()
1768 if (data->gen != -1) in verify()
1770 if (unichar < 0xd800 || unichar > 0xdfff) in verify()
1773 if (unichar >= 0xd800 && unichar <= 0xdfff) in verify()
1775 if (data->gen == -1) in verify()
1777 if (data->gen != LEAF_GEN(leaf)) in verify()
1780 if (HANGUL_SYLLABLE(data->code)) { in verify()
1781 if (data->utf8nfdi[0] != HANGUL) in verify()
1784 if (!data->utf8nfdi) { in verify()
1786 } else if (strcmp(data->utf8nfdi, in verify()
1791 if (!data->utf8nfdicf && in verify()
1792 !data->utf8nfdi) { in verify()
1794 } else if (data->utf8nfdicf) { in verify()
1795 if (strcmp(data->utf8nfdicf, in verify()
1798 } else if (strcmp(data->utf8nfdi, in verify()
1803 } else if (data->ccc != LEAF_CCC(leaf)) { in verify()
1809 " nfdi -> \"%s\"", in verify()
1810 unichar, data->code, data->gen, in verify()
1811 data->ccc, in verify()
1812 data->utf8nfdi); in verify()
1815 " nfdi -> \"%s\"", in verify()
1830 for (i = 0; i != trees_count; i++) in trees_verify()
1834 /* ------------------------------------------------------------------ */
1841 printf("normalization of UTF-8 strings. The trie is derived from\n"); in help()
1848 printf("\t- Apply unicode normalization form NFD.\n"); in help()
1849 printf("\t- Remove any Default_Ignorable_Code_Point.\n"); in help()
1852 printf("\t- Apply unicode normalization form NFD.\n"); in help()
1853 printf("\t- Remove any Default_Ignorable_Code_Point.\n"); in help()
1854 printf("\t- Apply a full casefold (C + F).\n"); in help()
1863 printf("by version 11.0.0 of the Unicode Character Database.\n"); in help()
1866 printf("\t-a %s\n", AGE_NAME); in help()
1867 printf("\t-c %s\n", CCC_NAME); in help()
1868 printf("\t-p %s\n", PROP_NAME); in help()
1869 printf("\t-d %s\n", DATA_NAME); in help()
1870 printf("\t-f %s\n", FOLD_NAME); in help()
1871 printf("\t-n %s\n", NORM_NAME); in help()
1874 printf("\t-t %s\n", TEST_NAME); in help()
1877 printf("\t-o %s\n", UTF8_NAME); in help()
1905 /* ------------------------------------------------------------------ */
1911 for (i = 0; utf32str[i]; i++) in print_utf32()
1917 printf(" %X ->", unichar); in print_utf32nfdi()
1924 printf(" %X ->", unichar); in print_utf32nfdicf()
1929 /* ------------------------------------------------------------------ */
1944 if (verbose > 0) in age_init()
1950 count = 0; in age_init()
1952 gen = 0; in age_init()
1970 if (!age_valid(major, minor, 0)) in age_init()
1979 if (ages_count == 0 || ages_count > MAXGEN) in age_init()
1982 /* There is a 0 entry. */ in age_init()
1986 ages[ages_count] = (unsigned int)-1; in age_init()
1989 count = 0; in age_init()
1990 gen = 0; in age_init()
2006 ages[++gen] = UNICODE_AGE(major, minor, 0); in age_init()
2010 if (!age_valid(major, minor, 0)) in age_init()
2019 count += 1 + last - first; in age_init()
2043 for (unichar = 0xd800; unichar <= 0xdfff; unichar++) in age_init()
2044 unicode_data[unichar].gen = -1; in age_init()
2046 if (verbose > 0) in age_init()
2048 if (count == 0) in age_init()
2062 if (verbose > 0) in ccc_init()
2069 count = 0; in ccc_init()
2096 if (verbose > 0) in ccc_init()
2098 if (count == 0) in ccc_init()
2110 for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) in ignore_compatibility_form()
2111 if (strcmp(type, ignored_types[i]) == 0) in ignore_compatibility_form()
2113 return 0; in ignore_compatibility_form()
2120 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfdi_init()
2128 if (verbose > 0) in nfdi_init()
2134 count = 0; in nfdi_init()
2148 *s++ = '\0'; in nfdi_init()
2152 /* decode the decomposition into UTF-32 */ in nfdi_init()
2153 i = 0; in nfdi_init()
2160 mapping[i++] = 0; in nfdi_init()
2171 if (verbose > 0) in nfdi_init()
2173 if (count == 0) in nfdi_init()
2181 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfdicf_init()
2189 if (verbose > 0) in nfdicf_init()
2195 count = 0; in nfdicf_init()
2209 i = 0; in nfdicf_init()
2216 mapping[i++] = 0; in nfdicf_init()
2227 if (verbose > 0) in nfdicf_init()
2229 if (count == 0) in nfdicf_init()
2243 if (verbose > 0) in ignore_init()
2249 count = 0; in ignore_init()
2260 *um = 0; in ignore_init()
2264 *um = 0; in ignore_init()
2281 *um = 0; in ignore_init()
2285 *um = 0; in ignore_init()
2296 if (verbose > 0) in ignore_init()
2298 if (count == 0) in ignore_init()
2311 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in corrections_init()
2317 if (verbose > 0) in corrections_init()
2323 count = 0; in corrections_init()
2338 count = 0; in corrections_init()
2352 i = 0; in corrections_init()
2360 mapping[i++] = 0; in corrections_init()
2367 printf(" %X -> %s -> %s V%d_%d_%d\n", in corrections_init()
2373 if (verbose > 0) in corrections_init()
2375 if (count == 0) in corrections_init()
2379 /* ------------------------------------------------------------------ */
2382 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2384 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2387 * SBase = 0xAC00
2388 * LBase = 0x1100
2389 * VBase = 0x1161
2390 * TBase = 0x11A7
2398 * SIndex = s - SBase
2418 * if (TIndex == 0) {
2429 unsigned int sb = 0xAC00; in hangul_decompose()
2430 unsigned int lb = 0x1100; in hangul_decompose()
2431 unsigned int vb = 0x1161; in hangul_decompose()
2432 unsigned int tb = 0x11a7; in hangul_decompose()
2444 if (verbose > 0) in hangul_decompose()
2447 count = 0; in hangul_decompose()
2448 for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { in hangul_decompose()
2449 unsigned int si = unichar - sb; in hangul_decompose()
2454 i = 0; in hangul_decompose()
2459 mapping[i++] = 0; in hangul_decompose()
2477 unicode_data[unichar].utf8nfdi[0] = HANGUL; in hangul_decompose()
2478 unicode_data[unichar].utf8nfdi[1] = '\0'; in hangul_decompose()
2485 if (verbose > 0) in hangul_decompose()
2492 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfdi_decompose()
2500 if (verbose > 0) in nfdi_decompose()
2503 count = 0; in nfdi_decompose()
2504 for (unichar = 0; unichar != 0x110000; unichar++) { in nfdi_decompose()
2509 i = 0; in nfdi_decompose()
2514 for (j = 0; dc[j]; j++) in nfdi_decompose()
2516 ret = 0; in nfdi_decompose()
2522 mapping[i++] = 0; in nfdi_decompose()
2540 if (verbose > 0) in nfdi_decompose()
2547 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfdicf_decompose()
2555 if (verbose > 0) in nfdicf_decompose()
2557 count = 0; in nfdicf_decompose()
2558 for (unichar = 0; unichar != 0x110000; unichar++) { in nfdicf_decompose()
2563 i = 0; in nfdicf_decompose()
2568 for (j = 0; dc[j]; j++) in nfdicf_decompose()
2570 ret = 0; in nfdicf_decompose()
2576 mapping[i++] = 0; in nfdicf_decompose()
2588 if (verbose > 0) in nfdicf_decompose()
2592 /* ------------------------------------------------------------------ */
2606 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2608 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2611 * SBase = 0xAC00
2612 * LBase = 0x1100
2613 * VBase = 0x1161
2614 * TBase = 0x11A7
2622 * SIndex = s - SBase
2642 * if (TIndex == 0) {
2651 #define SB (0xAC00)
2652 #define LB (0x1100)
2653 #define VB (0x1161)
2654 #define TB (0x11A7)
2671 si = utf8decode(str) - SB; in utf8hangul()
2682 /* Add LPart, a 3-byte UTF-8 sequence. */ in utf8hangul()
2685 /* Add VPart, a 3-byte UTF-8 sequence. */ in utf8hangul()
2688 /* Add TPart if required, also a 3-byte UTF-8 sequence. */ in utf8hangul()
2693 h[0] = '\0'; in utf8hangul()
2702 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703 * is well-formed and corresponds to a known unicode code point. The
2704 * shorthand for this will be "is valid UTF-8 unicode".
2717 if (len == 0) in utf8nlookup()
2720 trie = utf8data + tree->index; in utf8nlookup()
2724 if (--len == 0) in utf8nlookup()
2735 while (--offlen) { in utf8nlookup()
2766 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is in utf8nlookup()
2768 * start of the sequence is at s-2. in utf8nlookup()
2770 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) in utf8nlookup()
2771 trie = utf8hangul(s - 2, hangul); in utf8nlookup()
2784 return utf8nlookup(tree, hangul, s, (size_t)-1); in utf8lookup()
2788 * Return the number of bytes used by the current UTF-8 sequence.
2789 * Assumes the input points to the first byte of a valid UTF-8
2795 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); in utf8clen()
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if only non-assigned code points are used.
2806 int age = 0; in utf8agemax()
2811 return -1; in utf8agemax()
2816 return -1; in utf8agemax()
2818 if (leaf_age <= tree->maxage && leaf_age > age) in utf8agemax()
2827 * Return -1 if s is not valid UTF-8 unicode.
2828 * Return 0 if non-assigned code points are used.
2838 return -1; in utf8agemin()
2839 age = tree->maxage; in utf8agemin()
2843 return -1; in utf8agemin()
2845 if (leaf_age <= tree->maxage && leaf_age < age) in utf8agemin()
2854 * Return -1 if s is not valid UTF-8 unicode.
2859 int age = 0; in utf8nagemax()
2864 return -1; in utf8nagemax()
2869 return -1; in utf8nagemax()
2871 if (leaf_age <= tree->maxage && leaf_age > age) in utf8nagemax()
2873 len -= utf8clen(s); in utf8nagemax()
2881 * Return -1 if s is not valid UTF-8 unicode.
2891 return -1; in utf8nagemin()
2892 age = tree->maxage; in utf8nagemin()
2896 return -1; in utf8nagemin()
2898 if (leaf_age <= tree->maxage && leaf_age < age) in utf8nagemin()
2900 len -= utf8clen(s); in utf8nagemin()
2908 * Return -1 if s is not valid UTF-8 unicode.
2910 * A string of Default_Ignorable_Code_Point has length 0.
2915 size_t ret = 0; in utf8len()
2919 return -1; in utf8len()
2923 return -1; in utf8len()
2924 if (ages[LEAF_GEN(leaf)] > tree->maxage) in utf8len()
2937 * Return -1 if s is not valid UTF-8 unicode.
2942 size_t ret = 0; in utf8nlen()
2946 return -1; in utf8nlen()
2950 return -1; in utf8nlen()
2951 if (ages[LEAF_GEN(leaf)] > tree->maxage) in utf8nlen()
2957 len -= utf8clen(s); in utf8nlen()
2988 * Returns -1 on error, 0 on success.
2994 return -1; in utf8ncursor()
2996 return -1; in utf8ncursor()
2997 u8c->tree = tree; in utf8ncursor()
2998 u8c->s = s; in utf8ncursor()
2999 u8c->p = NULL; in utf8ncursor()
3000 u8c->ss = NULL; in utf8ncursor()
3001 u8c->sp = NULL; in utf8ncursor()
3002 u8c->len = len; in utf8ncursor()
3003 u8c->slen = 0; in utf8ncursor()
3004 u8c->ccc = STOPPER; in utf8ncursor()
3005 u8c->nccc = STOPPER; in utf8ncursor()
3006 u8c->unichar = 0; in utf8ncursor()
3008 if (u8c->len != len) in utf8ncursor()
3009 return -1; in utf8ncursor()
3011 if (len > 0 && (*s & 0xC0) == 0x80) in utf8ncursor()
3012 return -1; in utf8ncursor()
3013 return 0; in utf8ncursor()
3019 * s : NUL-terminated string.
3023 * Returns -1 on error, 0 on success.
3027 return utf8ncursor(u8c, tree, s, (unsigned int)-1); in utf8cursor()
3033 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3035 * The cursor keeps track of the location in the string in u8c->s.
3037 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038 * that bytes from a decomposition do not count against u8c->len.
3040 * Characters are emitted if they match the current CCC in u8c->ccc.
3041 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042 * and the function returns 0 in that case.
3045 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3047 * emitted and stores it in u8c->nccc, the second pass emits the
3053 * u8c->p != NULL -> a decomposition is being scanned.
3054 * u8c->ss != NULL -> this is a repeating scan.
3055 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3064 if (u8c->p && *u8c->s == '\0') { in utf8byte()
3065 u8c->s = u8c->p; in utf8byte()
3066 u8c->p = NULL; in utf8byte()
3069 /* Check for end-of-string. */ in utf8byte()
3070 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { in utf8byte()
3072 if (u8c->ccc == STOPPER) in utf8byte()
3073 return 0; in utf8byte()
3074 /* End-of-string during a scan counts as a stopper. */ in utf8byte()
3077 } else if ((*u8c->s & 0xC0) == 0x80) { in utf8byte()
3079 if (!u8c->p) in utf8byte()
3080 u8c->len--; in utf8byte()
3081 return (unsigned char)*u8c->s++; in utf8byte()
3085 if (u8c->p) { in utf8byte()
3086 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); in utf8byte()
3088 leaf = utf8nlookup(u8c->tree, u8c->hangul, in utf8byte()
3089 u8c->s, u8c->len); in utf8byte()
3094 return -1; in utf8byte()
3096 /* Characters that are too new have CCC 0. */ in utf8byte()
3097 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { in utf8byte()
3100 u8c->len -= utf8clen(u8c->s); in utf8byte()
3101 u8c->p = u8c->s + utf8clen(u8c->s); in utf8byte()
3102 u8c->s = LEAF_STR(leaf); in utf8byte()
3103 /* Empty decomposition implies CCC 0. */ in utf8byte()
3104 if (*u8c->s == '\0') { in utf8byte()
3105 if (u8c->ccc == STOPPER) in utf8byte()
3110 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); in utf8byte()
3113 u8c->unichar = utf8decode(u8c->s); in utf8byte()
3119 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) in utf8byte()
3120 u8c->nccc = ccc; in utf8byte()
3126 if (ccc == u8c->ccc) { in utf8byte()
3127 if (!u8c->p) in utf8byte()
3128 u8c->len--; in utf8byte()
3129 return (unsigned char)*u8c->s++; in utf8byte()
3134 if (u8c->nccc == STOPPER) { in utf8byte()
3140 assert(u8c->ccc == STOPPER); in utf8byte()
3141 u8c->ccc = MINCCC - 1; in utf8byte()
3142 u8c->nccc = ccc; in utf8byte()
3143 u8c->sp = u8c->p; in utf8byte()
3144 u8c->ss = u8c->s; in utf8byte()
3145 u8c->slen = u8c->len; in utf8byte()
3146 if (!u8c->p) in utf8byte()
3147 u8c->len -= utf8clen(u8c->s); in utf8byte()
3148 u8c->s += utf8clen(u8c->s); in utf8byte()
3151 if (!u8c->p) in utf8byte()
3152 u8c->len -= utf8clen(u8c->s); in utf8byte()
3153 u8c->s += utf8clen(u8c->s); in utf8byte()
3154 } else if (u8c->nccc != MAXCCC + 1) { in utf8byte()
3156 u8c->ccc = u8c->nccc; in utf8byte()
3157 u8c->nccc = MAXCCC + 1; in utf8byte()
3158 u8c->s = u8c->ss; in utf8byte()
3159 u8c->p = u8c->sp; in utf8byte()
3160 u8c->len = u8c->slen; in utf8byte()
3163 u8c->ccc = STOPPER; in utf8byte()
3164 u8c->nccc = STOPPER; in utf8byte()
3165 u8c->sp = NULL; in utf8byte()
3166 u8c->ss = NULL; in utf8byte()
3167 u8c->slen = 0; in utf8byte()
3172 /* ------------------------------------------------------------------ */
3181 /* First test: null-terminated string. */ in normalize_line()
3185 return -1; in normalize_line()
3186 while ((c = utf8byte(&u8c)) > 0) in normalize_line()
3188 return -1; in normalize_line()
3189 if (c < 0) in normalize_line()
3190 return -1; in normalize_line()
3191 if (*t != 0) in normalize_line()
3192 return -1; in normalize_line()
3194 /* Second test: length-limited string. */ in normalize_line()
3197 s[strlen(s) + 1] = -1; in normalize_line()
3200 return -1; in normalize_line()
3201 while ((c = utf8byte(&u8c)) > 0) in normalize_line()
3203 return -1; in normalize_line()
3204 if (c < 0) in normalize_line()
3205 return -1; in normalize_line()
3206 if (*t != 0) in normalize_line()
3207 return -1; in normalize_line()
3209 return 0; in normalize_line()
3221 int tests = 0; in normalization_test()
3222 int failures = 0; in normalization_test()
3224 if (verbose > 0) in normalization_test()
3242 *t = '\0'; in normalization_test()
3244 ignorables = 0; in normalization_test()
3250 if (data->utf8nfdi && !*data->utf8nfdi) in normalization_test()
3255 *t = '\0'; in normalization_test()
3258 if (normalize_line(nfdi_tree) < 0) { in normalization_test()
3259 printf("Line %s -> %s", buf0, buf1); in normalization_test()
3267 if (verbose > 0) in normalization_test()
3273 /* ------------------------------------------------------------------ */
3283 if (verbose > 0) in write_file()
3296 for (i = 0; i != ages_count; i++) in write_file()
3302 t = 0; in write_file()
3303 for (gen = 0; gen < ages_count; gen++) { in write_file()
3314 for (gen = 0; gen < ages_count; gen++) { in write_file()
3325 t = 0; in write_file()
3326 for (i = 0; i != utf8data_size; i += 16) { in write_file()
3330 if (t < trees_count-1) in write_file()
3335 fprintf(file, "0x%.2x%s", utf8data[j], in write_file()
3336 (j < utf8data_size -1 ? "," : "")); in write_file()
3359 /* ------------------------------------------------------------------ */
3366 argv0 = argv[0]; in main()
3368 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { in main()
3399 exit(0); in main()
3407 for (unichar = 0; unichar != 0x110000; unichar++) in main()
3432 return 0; in main()