1 /*
2 * Copyright (C) 2002 by Red Hat, Incorporated. All rights reserved.
3 *
4 * Permission to use, copy, modify, and distribute this software
5 * is freely granted, provided that this notice is preserved.
6 *
7 * Tests gleaned from Markus Kuhn's UTF-8 and Unicode FAQ,
8 * and specifically, his UTF-8-test.txt decoder stress test file.
9 */
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <locale.h>
14
15 #define MAX_BYTES 65
16
17 int num_invalid(const char *s, int len);
18
/* Section 2.1 inputs: the first (lowest) code point that needs each
   encoded length, from one byte up to six bytes.  Row N-1 holds the
   N-byte sequence; unused trailing bytes are zero.  */
char first[6][6] = {
  { 0x00 },                               /* U-00000000 */
  { 0xC2, 0x80 },                         /* U-00000080 */
  { 0xE0, 0xA0, 0x80 },                   /* U-00000800 */
  { 0xF0, 0x90, 0x80, 0x80 },             /* U-00010000 */
  { 0xF8, 0x88, 0x80, 0x80, 0x80 },       /* U-00200000 */
  { 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 }  /* U-04000000 */
};

/* Section 2.2 inputs: the last (highest) code point that fits in each
   encoded length, laid out the same way as the table above.  */
char last[6][6] = {
  { 0x7F },                               /* U-0000007F */
  { 0xDF, 0xBF },                         /* U-000007FF */
  { 0xEF, 0xBF, 0xBF },                   /* U-0000FFFF */
  { 0xF7, 0xBF, 0xBF, 0xBF },             /* U-001FFFFF */
  { 0xFB, 0xBF, 0xBF, 0xBF, 0xBF },       /* U-03FFFFFF */
  { 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF }  /* U-7FFFFFFF */
};

/* Section 2.3 inputs: other boundary code points.  The first three rows
   are three-byte sequences and the last two are four-byte sequences.  */
char boundary[5][6] = {
  { 0xED, 0x9F, 0xBF },        /* U-0000D7FF */
  { 0xEE, 0x80, 0x80 },        /* U-0000E000 */
  { 0xEF, 0xBF, 0xBD },        /* U-0000FFFD */
  { 0xF4, 0x8F, 0xBF, 0xBF },  /* U-0010FFFF */
  { 0xF4, 0x90, 0x80, 0x80 }   /* U-00110000 */
};
44
/* Section 3.1 inputs: runs of continuation bytes with no lead byte.
   Rows 0 and 1 are the single bytes 0x80 and 0xBF; row K (K >= 2) is a
   run of K bytes alternating 0x80 / 0xBF.  */
char continuation_bytes[8][7] = {
  { 0x80 },
  { 0xBF },
  { 0x80, 0xBF },
  { 0x80, 0xBF, 0x80 },
  { 0x80, 0xBF, 0x80, 0xBF },
  { 0x80, 0xBF, 0x80, 0xBF, 0x80 },
  { 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF },
  { 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80 }
};

/* Scratch buffer that main fills with every byte 0x80..0xBF (3.1.9).  */
char all_continuation_bytes[64];


/* Scratch buffers that main fills with every possible lead byte of a
   given sequence length (section 3.2).  */
char all_two_byte_seq[32];    /* 0xC0..0xDF */
char all_three_byte_seq[16];  /* 0xE0..0xEF */
char all_four_byte_seq[8];    /* 0xF0..0xF7 */
char all_five_byte_seq[4];    /* 0xF8..0xFB */
char all_six_byte_seq[2];     /* 0xFC..0xFD */
64
/* Section 3.3 inputs: multibyte sequences whose final continuation byte
   has been dropped.  Rows 0-4 are truncated forms of the "first" code
   points of lengths 2..6; rows 5-9 are truncated forms of the "last"
   code points of lengths 2..6.  The comment on each row names the code
   point whose encoding was cut short.  */
char incomplete_seq[10][6] = {
  { 0xC2 },                          /* U-00000080 */
  { 0xE0, 0x80 },                    /* U-00000800 */
  { 0xF0, 0x80, 0x80 },              /* U-00010000 */
  { 0xF8, 0x80, 0x80, 0x80 },        /* U-00200000 */
  { 0xFC, 0x80, 0x80, 0x80, 0x80 },  /* U-04000000 */
  { 0xDF },                          /* U-000007FF */
  { 0xEF, 0xBF },                    /* U-0000FFFF */
  { 0xF7, 0xBF, 0xBF },              /* U-001FFFFF */
  { 0xFB, 0xBF, 0xBF, 0xBF },        /* U-03FFFFFF */
  { 0xFD, 0xBF, 0xBF, 0xBF, 0xBF }   /* U-7FFFFFFF */
};

/* Buffer reserved for the concatenation test (3.4).  That test is
   excluded in main, so nothing visible here writes or reads it.  */
char incomplete_seq_concat[30];

/* Section 3.5 inputs: bytes 0xFE and 0xFF on their own, then the
   four-byte run FE FE FF FF.  */
char impossible_bytes[3][4] = {
  { 0xFE },
  { 0xFF },
  { 0xFE, 0xFE, 0xFF, 0xFF }
};
85
/* Section 4.1 inputs: the value 0x2F written with more bytes than
   necessary, using two through six bytes.  */
char overlong[5][6] = {
  { 0xC0, 0xAF },
  { 0xE0, 0x80, 0xAF },
  { 0xF0, 0x80, 0x80, 0xAF },
  { 0xF8, 0x80, 0x80, 0x80, 0xAF },
  { 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF }
};

/* Section 4.2 inputs: for each length from two to six bytes, the
   largest value that could still have been written one byte shorter
   (0x7F, 0x7FF, 0xFFFF, 0x1FFFFF, 0x3FFFFFF).  */
char overlong_max[5][6] = {
  { 0xC1, 0xBF },
  { 0xE0, 0x9F, 0xBF },
  { 0xF0, 0x8F, 0xBF, 0xBF },
  { 0xF8, 0x87, 0xBF, 0xBF, 0xBF },
  { 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF }
};

/* Section 4.3 inputs: the NUL character written with two through six
   bytes.  */
char overlong_nul[5][6] = {
  { 0xC0, 0x80 },
  { 0xE0, 0x80, 0x80 },
  { 0xF0, 0x80, 0x80, 0x80 },
  { 0xF8, 0x80, 0x80, 0x80, 0x80 },
  { 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 }
};
109
/* Section 5.1 inputs: three-byte encodings of individual UTF-16
   surrogate values.  Rows are exactly three bytes, with no padding.  */
char single_surrogates[7][3] = {
  { 0xED, 0xA0, 0x80 },  /* D800 */
  { 0xED, 0xAD, 0xBF },  /* DB7F */
  { 0xED, 0xAE, 0x80 },  /* DB80 */
  { 0xED, 0xAF, 0xBF },  /* DBFF */
  { 0xED, 0xB0, 0x80 },  /* DC00 */
  { 0xED, 0xBE, 0x80 },  /* DF80 */
  { 0xED, 0xBF, 0xBF }   /* DFFF */
};

/* Section 5.2 inputs: a high surrogate followed directly by a low
   surrogate, each encoded as its own three-byte sequence.  */
char paired_surrogates[8][6] = {
  { 0xED, 0xA0, 0x80,  0xED, 0xB0, 0x80 },  /* D800 DC00 */
  { 0xED, 0xA0, 0x80,  0xED, 0xBF, 0xBF },  /* D800 DFFF */
  { 0xED, 0xAD, 0xBF,  0xED, 0xB0, 0x80 },  /* DB7F DC00 */
  { 0xED, 0xAD, 0xBF,  0xED, 0xBF, 0xBF },  /* DB7F DFFF */
  { 0xED, 0xAE, 0x80,  0xED, 0xB0, 0x80 },  /* DB80 DC00 */
  { 0xED, 0xAE, 0x80,  0xED, 0xBF, 0xBF },  /* DB80 DFFF */
  { 0xED, 0xAF, 0xBF,  0xED, 0xB0, 0x80 },  /* DBFF DC00 */
  { 0xED, 0xAF, 0xBF,  0xED, 0xBF, 0xBF }   /* DBFF DFFF */
};

/* Section 5.3 inputs ("other illegal code positions").
   NOTE(review): these rows hold the raw bytes FF FE and FF FF, not the
   UTF-8 encodings of U+FFFE / U+FFFF (EF BF BE / EF BF BF) -- confirm
   that this is what the test is meant to feed to mbtowc.  */
char illegal_pos[2][3] = {
  { 0xFF, 0xFE },
  { 0xFF, 0xFF }
};
135
main(void)136 int main(void)
137 {
138 wchar_t wchar;
139 int retval;
140 int i;
141
142 if (!setlocale(LC_CTYPE, "C-UTF-8"))
143 {
144 printf("Failed to set C-UTF-8 locale.\n");
145 return 1;
146 }
147 else
148 printf("Set C-UTF-8 locale.\n");
149
150 /* 2 Boundary condition test cases */
151 /* 2.1 First possible sequence of a certain length */
152 retval = mbtowc(&wchar, first[0], MAX_BYTES);
153 if (retval == 0)
154 printf("2.1.1: U-%08ld\n", (long) wchar);
155 else
156 printf("2.1.1: Invalid\n");
157
158 for (i = 2; i < 7; i++)
159 {
160 retval = mbtowc (&wchar, first[i-1], MAX_BYTES);
161 if (retval == i)
162 printf("2.1.%d: U-%08lx\n", i, (long) wchar);
163 else
164 printf("2.1.%d: Invalid\n", i);
165 }
166
167 /* 2.2 Last possible sequence of a certain length */
168 for (i = 1; i < 7; i++)
169 {
170 retval = mbtowc (&wchar, last[i-1], MAX_BYTES);
171 if (retval == i)
172 printf("2.2.%d: U-%08lx\n", i, (long) wchar);
173 else
174 printf("2.2.%d: Invalid\n", i);
175 }
176
177 /* 2.3 Other boundary conditions */
178 for (i = 1; i < 6; i++)
179 {
180 retval = mbtowc (&wchar, boundary[i-1], MAX_BYTES);
181 if ((i < 4 && retval == 3) || (i > 3 && retval == 4))
182 printf("2.3.%d: U-%08lx\n", i, (long) wchar);
183 else
184 printf("2.3.%d: Invalid\n", i);
185 }
186
187 /* 3 Malformed sequences */
188 /* 3.1 Unexpected continuation bytes */
189 retval = mbtowc (&wchar, continuation_bytes[0], MAX_BYTES);
190 if (retval == 1)
191 printf("3.1.1: U-%08lx\n", (long) wchar);
192 else
193 printf("3.1.1: 1 Invalid\n");
194
195 retval = mbtowc (&wchar, continuation_bytes[1], MAX_BYTES);
196 if (retval == 1)
197 printf("3.1.2: U-%08lx\n", (long) wchar);
198 else
199 printf("3.1.2: 1 Invalid\n");
200
201 for(i=2; i< 8; i++)
202 {
203 retval = num_invalid(continuation_bytes[i], i);
204 if (retval == -1)
205 printf("3.1.%d: Valid Character Found\n", i+1);
206 else
207 printf("3.1.%d: %d Invalid\n", i+1, retval);
208 }
209
210 for(i = 0x80; i < 0xc0; i++)
211 all_continuation_bytes[i-0x80] = i;
212
213 retval = num_invalid(all_continuation_bytes, 0xc0 - 0x80);
214 if (retval == -1)
215 printf("3.1.9: Valid Character Found\n");
216 else
217 printf("3.1.9: %d Invalid\n", retval);
218
219 /* 3.2 Lonely start characters */
220 for(i = 0xc0; i < 0xe0; i++)
221 all_two_byte_seq[i-0xc0] = i;
222
223 retval = num_invalid(all_two_byte_seq, 0xe0 - 0xc0);
224 if (retval == -1)
225 printf("3.2.1: Valid Character Found\n");
226 else
227 printf("3.2.1: %d Invalid\n", retval);
228
229 for(i = 0xe0; i < 0xf0; i++)
230 all_three_byte_seq[i-0xe0] = i;
231
232 retval = num_invalid(all_three_byte_seq, 0xf0 - 0xe0);
233 if (retval == -1)
234 printf("3.2.2: Valid Character Found\n");
235 else
236 printf("3.2.2: %d Invalid\n", retval);
237
238 for(i = 0xf0; i < 0xf8; i++)
239 all_four_byte_seq[i-0xf0] = i;
240
241 retval = num_invalid(all_four_byte_seq, 0xf8 - 0xf0);
242 if (retval == -1)
243 printf("3.2.3: Valid Character Found\n");
244 else
245 printf("3.2.3: %d Invalid\n", retval);
246
247 for(i = 0xf8; i < 0xfc; i++)
248 all_five_byte_seq[i-0xf8] = i;
249
250 retval = num_invalid(all_five_byte_seq, 0xfc - 0xf8);
251 if (retval == -1)
252 printf("3.2.4: Valid Character Found\n");
253 else
254 printf("3.2.4: %d Invalid\n", retval);
255
256 for(i = 0xfc; i < 0xfe; i++)
257 all_six_byte_seq[i-0xfc] = i;
258
259 retval = num_invalid(all_six_byte_seq, 0xfe - 0xfc);
260 if (retval == -1)
261 printf("3.2.5: Valid Character Found\n");
262 else
263 printf("3.2.5: %d Invalid\n", retval);
264
265 /* 3.3 Sequences with last continuation byte missing */
266 for(i = 1; i < 6; i++)
267 {
268 retval = mbtowc(&wchar, incomplete_seq[i-1], i);
269 if(retval == -1)
270 printf("3.3.%d: 1 Invalid\n", i);
271 else
272 printf("3.3.%d: Valid Character Found\n", i);
273 }
274
275 for(i = 6; i < 11; i++)
276 {
277 retval = mbtowc(&wchar, incomplete_seq[i-1], i - 5);
278 if(retval == -1)
279 printf("3.3.%d: 1 Invalid\n", i);
280 else
281 printf("3.3.%d: Valid Character Found\n", i);
282 }
283
284 /* 3.4 Concatenation of incomplete sequences */
285 /* This test is excluded because the mbtowc function does not return the
286 number of bytes read in an invalid multi-byte sequence. */
287
288 /* 3.5 Impossible bytes */
289 retval = mbtowc(&wchar, impossible_bytes[0], 1);
290 if(retval == -1)
291 printf("3.5.1: 1 Invalid\n");
292 else
293 printf("3.5.1: Valid Character Found\n");
294
295 retval = mbtowc(&wchar, impossible_bytes[1], 1);
296 if(retval == -1)
297 printf("3.5.2: 1 Invalid\n");
298 else
299 printf("3.5.2: Valid Character Found\n");
300
301 retval = mbtowc(&wchar, impossible_bytes[2], 4);
302 if(retval == -1)
303 printf("3.5.3: 1 Invalid\n");
304 else
305 printf("3.5.3: Valid Character Found\n");
306
307 /* 4 Overlong sequences */
308 /* 4.1 Examples of an overlong ASCII character */
309 for(i = 2; i < 7; i++)
310 {
311 retval = mbtowc(&wchar, overlong[i-2], i);
312 if(retval == -1)
313 printf("4.1.%d: 1 Invalid\n", i-1);
314 else
315 printf("4.1.%d: Valid Character Found\n", i-1);
316 }
317
318 /* 4.2 Maximum overlong sequences */
319 for(i = 2; i < 7; i++)
320 {
321 retval = mbtowc(&wchar, overlong_max[i-2], i);
322 if(retval == -1)
323 printf("4.2.%d: 1 Invalid\n", i-1);
324 else
325 printf("4.2.%d: Valid Character Found\n", i-1);
326 }
327
328 /* 4.3 Overlong representation of the NUL character */
329 for(i = 2; i < 7; i++)
330 {
331 retval = mbtowc(&wchar, overlong_nul[i-2], i);
332 if(retval == -1)
333 printf("4.3.%d: 1 Invalid\n", i-1);
334 else
335 printf("4.3.%d: Valid Character Found\n", i-1);
336 }
337
338 /* 5 Illegal code positions */
339 /* 5.1 Single UTF-16 surrogates */
340 for (i = 1; i < 8; i++)
341 {
342 retval = mbtowc(&wchar, single_surrogates[i-1], 3);
343 if(retval == -1)
344 printf("5.1.%d: 1 Invalid\n", i);
345 else
346 printf("5.1.%d: Valid Character Found\n", i);
347 }
348
349 /* 5.2 Paired UTF-16 surrogates */
350 for (i = 1; i < 8; i++)
351 {
352 retval = mbtowc(&wchar, paired_surrogates[i-1], 6);
353 if(retval == -1)
354 printf("5.2.%d: 1 Invalid\n", i);
355 else
356 printf("5.2.%d: Valid Character Found\n", i);
357 }
358
359 /* 5.3 Other illegal code positions */
360 retval = mbtowc(&wchar, illegal_pos[0], 3);
361 if(retval == -1)
362 printf("5.3.1: 1 Invalid\n");
363 else
364 printf("5.3.1: Valid Character Found\n");
365
366 retval = mbtowc(&wchar, illegal_pos[1], 3);
367 if(retval == -1)
368 printf("5.3.2: 1 Invalid\n");
369 else
370 printf("5.3.2: Valid Character Found\n");
371
372 return 0;
373 }
374
/* Probe every byte offset of the LEN-byte buffer S with mbtowc, giving
   it the bytes that remain from that offset to the end of the buffer.
   Returns how many offsets were probed when mbtowc reports -1 at every
   one of them (0 when LEN is not positive).  Returns -1 as soon as
   mbtowc returns anything other than -1 at some offset, i.e. as soon as
   a character is accepted.  */
int
num_invalid (const char *s, int len)
{
  const char *cursor = s;
  int remaining = len;
  int rejected = 0;
  wchar_t decoded;

  while (remaining > 0)
    {
      if (mbtowc (&decoded, cursor, remaining) != -1)
        return -1;

      rejected++;
      cursor++;
      remaining--;
    }

  return rejected;
}
398