1 /*
2  *  Copyright (C) 2002 by Red Hat, Incorporated. All rights reserved.
3  *
4  *  Permission to use, copy, modify, and distribute this software
5  *  is freely granted, provided that this notice is preserved.
6  *
7  *  Tests gleaned from Markus Kuhn's UTF-8 and Unicode FAQ,
8  *  and specifically, his UTF-8-test.txt decoder stress test file.
9  */
10 
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <locale.h>
14 
/* Length limit handed to mbtowc by the tests that do not pass a
   sequence-specific byte count.
   NOTE(review): this is larger than any row of the byte tables in this
   file, so those calls rely on mbtowc stopping at the end of the
   sequence -- confirm that is intended.  */
#define MAX_BYTES 65

/* Counts rejected byte offsets in a buffer; defined after main.  */
int num_invalid(const char *s, int len);
18 
/* Test group 2.1: first (lowest) code point of each encoded length.
   Row N-1 holds the N-byte sequence; unused trailing bytes are zero.  */
char first[6][6] = {
  "\x00",                         /* U-00000000 */
  "\xc2\x80",                     /* U-00000080 */
  "\xe0\xa0\x80",                 /* U-00000800 */
  "\xf0\x90\x80\x80",             /* U-00010000 */
  "\xf8\x88\x80\x80\x80",         /* U-00200000 */
  "\xfc\x84\x80\x80\x80\x80"      /* U-04000000 (fills the row, no NUL) */
};
27 
/* Test group 2.2: last (highest) code point of each encoded length.
   Row N-1 holds the N-byte sequence; unused trailing bytes are zero.  */
char last[6][6] = {
  "\x7f",                         /* U-0000007F */
  "\xdf\xbf",                     /* U-000007FF */
  "\xef\xbf\xbf",                 /* U-0000FFFF */
  "\xf7\xbf\xbf\xbf",             /* U-001FFFFF */
  "\xfb\xbf\xbf\xbf\xbf",         /* U-03FFFFFF */
  "\xfd\xbf\xbf\xbf\xbf\xbf"      /* U-7FFFFFFF (fills the row, no NUL) */
};
36 
/* Test group 2.3: other boundary code points.  The first three rows are
   3-byte sequences, the last two are 4-byte sequences.  */
char boundary[5][6] = {
  "\xed\x9f\xbf",                 /* U-0000D7FF */
  "\xee\x80\x80",                 /* U-0000E000 */
  "\xef\xbf\xbd",                 /* U-0000FFFD */
  "\xf4\x8f\xbf\xbf",             /* U-0010FFFF */
  "\xf4\x90\x80\x80"              /* U-00110000 */
};
44 
/* Test group 3.1: continuation bytes with no lead byte.  Rows 0 and 1 are
   the single bytes 0x80 and 0xbf; row K (K >= 2) holds K bytes that
   alternate 0x80, 0xbf.  Unused trailing bytes are zero.  */
char continuation_bytes[8][7] = {
  "\x80",
  "\xbf",
  "\x80\xbf",
  "\x80\xbf\x80",
  "\x80\xbf\x80\xbf",
  "\x80\xbf\x80\xbf\x80",
  "\x80\xbf\x80\xbf\x80\xbf",
  "\x80\xbf\x80\xbf\x80\xbf\x80"  /* fills the row, no NUL */
};
55 
/* Scratch buffers filled at run time by main, one byte per value in the
   half-open range that gives each buffer its size.  */
char all_continuation_bytes[0xc0 - 0x80];  /* 0x80..0xbf, test 3.1.9 */

char all_two_byte_seq[0xe0 - 0xc0];        /* 0xc0..0xdf, test 3.2.1 */
char all_three_byte_seq[0xf0 - 0xe0];      /* 0xe0..0xef, test 3.2.2 */
char all_four_byte_seq[0xf8 - 0xf0];       /* 0xf0..0xf7, test 3.2.3 */
char all_five_byte_seq[0xfc - 0xf8];       /* 0xf8..0xfb, test 3.2.4 */
char all_six_byte_seq[0xfe - 0xfc];        /* 0xfc..0xfd, test 3.2.5 */
64 
/* Test group 3.3: multi-byte sequences with the final continuation byte
   missing.  Rows 0-4 and rows 5-9 each hold 1, 2, 3, 4 and 5 bytes of a
   2-, 3-, 4-, 5- and 6-byte sequence respectively.  */
char incomplete_seq[10][6] = {
  "\xc2",                         /* 1 of 2 bytes, cf. U-00000080 */
  "\xe0\x80",                     /* 2 of 3 bytes, cf. U-00000800 */
  "\xf0\x80\x80",                 /* 3 of 4 bytes, cf. U-00010000 */
  "\xf8\x80\x80\x80",             /* 4 of 5 bytes, cf. U-00200000 */
  "\xfc\x80\x80\x80\x80",         /* 5 of 6 bytes, cf. U-04000000 */
  "\xdf",                         /* 1 of 2 bytes, cf. U-000007FF */
  "\xef\xbf",                     /* 2 of 3 bytes, cf. U-0000FFFF */
  "\xf7\xbf\xbf",                 /* 3 of 4 bytes, cf. U-001FFFFF */
  "\xfb\xbf\xbf\xbf",             /* 4 of 5 bytes, cf. U-03FFFFFF */
  "\xfd\xbf\xbf\xbf\xbf"          /* 5 of 6 bytes, cf. U-7FFFFFFF */
};
77 
/* 30 bytes: the combined length of the ten rows of incomplete_seq
   (1+2+3+4+5, twice).  Not referenced by the tests in this file; the
   comment for test 3.4 in main says that test is excluded.  */
char incomplete_seq_concat[2 * (1 + 2 + 3 + 4 + 5)];
79 
/* Test group 3.5: the bytes 0xfe and 0xff, alone and then combined as
   fe fe ff ff.  Unused trailing bytes are zero.  */
char impossible_bytes[3][4] = {
  "\xfe",
  "\xff",
  "\xfe\xfe\xff\xff"              /* fills the row, no NUL */
};
85 
/* Test group 4.1: overlong 2- to 6-byte encodings whose payload bits all
   decode to 0x2f.  Row K holds the (K+2)-byte form.  */
char overlong[5][6] = {
  "\xc0\xaf",
  "\xe0\x80\xaf",
  "\xf0\x80\x80\xaf",
  "\xf8\x80\x80\x80\xaf",
  "\xfc\x80\x80\x80\x80\xaf"      /* fills the row, no NUL */
};
93 
/* Test group 4.2: for each length from 2 to 6 bytes, the highest code
   point that still fits in a shorter encoding, written in the longer
   (overlong) form.  Row K holds the (K+2)-byte form.  */
char overlong_max[5][6] = {
  "\xc1\xbf",                     /* overlong U-0000007F */
  "\xe0\x9f\xbf",                 /* overlong U-000007FF */
  "\xf0\x8f\xbf\xbf",             /* overlong U-0000FFFF */
  "\xf8\x87\xbf\xbf\xbf",         /* overlong U-001FFFFF */
  "\xfc\x83\xbf\xbf\xbf\xbf"      /* overlong U-03FFFFFF (no NUL) */
};
101 
/* Test group 4.3: overlong 2- to 6-byte encodings of the NUL character
   (all payload bits zero).  Row K holds the (K+2)-byte form.  */
char overlong_nul[5][6] = {
  "\xc0\x80",
  "\xe0\x80\x80",
  "\xf0\x80\x80\x80",
  "\xf8\x80\x80\x80\x80",
  "\xfc\x80\x80\x80\x80\x80"      /* fills the row, no NUL */
};
109 
/* Test group 5.1: 3-byte encodings of individual UTF-16 surrogate code
   points.  Every row is exactly three bytes with no terminator.  */
char single_surrogates[7][3] = {
  "\xed\xa0\x80",                 /* U+D800 */
  "\xed\xad\xbf",                 /* U+DB7F */
  "\xed\xae\x80",                 /* U+DB80 */
  "\xed\xaf\xbf",                 /* U+DBFF */
  "\xed\xb0\x80",                 /* U+DC00 */
  "\xed\xbe\x80",                 /* U+DF80 */
  "\xed\xbf\xbf"                  /* U+DFFF */
};
119 
/* Test group 5.2: a high surrogate followed by a low surrogate, each
   written as a 3-byte sequence.  Every row is exactly six bytes with no
   terminator.  */
char paired_surrogates[8][6] = {
  "\xed\xa0\x80" "\xed\xb0\x80",  /* U+D800 U+DC00 */
  "\xed\xa0\x80" "\xed\xbf\xbf",  /* U+D800 U+DFFF */
  "\xed\xad\xbf" "\xed\xb0\x80",  /* U+DB7F U+DC00 */
  "\xed\xad\xbf" "\xed\xbf\xbf",  /* U+DB7F U+DFFF */
  "\xed\xae\x80" "\xed\xb0\x80",  /* U+DB80 U+DC00 */
  "\xed\xae\x80" "\xed\xbf\xbf",  /* U+DB80 U+DFFF */
  "\xed\xaf\xbf" "\xed\xb0\x80",  /* U+DBFF U+DC00 */
  "\xed\xaf\xbf" "\xed\xbf\xbf"   /* U+DBFF U+DFFF */
};
130 
/* Test group 5.3: two byte pairs, ff fe and ff ff, each followed by a
   zero byte.
   NOTE(review): main labels this group "Other illegal code positions",
   but these rows are the raw bytes 0xff/0xfe, not UTF-8 encodings of
   U+FFFE / U+FFFF (which would presumably be ef bf be / ef bf bf) --
   confirm which was intended before changing the data.  */
char illegal_pos[2][3] = {
  "\xff\xfe",
  "\xff\xff"
};
135 
main(void)136 int main(void)
137   {
138     wchar_t wchar;
139     int retval;
140     int i;
141 
142     if (!setlocale(LC_CTYPE, "C-UTF-8"))
143       {
144         printf("Failed to set C-UTF-8 locale.\n");
145         return 1;
146       }
147     else
148       printf("Set C-UTF-8 locale.\n");
149 
150     /* 2  Boundary condition test cases */
151     /* 2.1  First possible sequence of a certain length */
152     retval = mbtowc(&wchar, first[0], MAX_BYTES);
153     if (retval == 0)
154       printf("2.1.1: U-%08ld\n", (long) wchar);
155     else
156       printf("2.1.1: Invalid\n");
157 
158     for (i = 2; i < 7; i++)
159     {
160       retval = mbtowc (&wchar, first[i-1], MAX_BYTES);
161       if (retval == i)
162         printf("2.1.%d: U-%08lx\n", i, (long) wchar);
163       else
164         printf("2.1.%d: Invalid\n", i);
165     }
166 
167     /* 2.2  Last possible sequence of a certain length */
168     for (i = 1; i < 7; i++)
169     {
170       retval = mbtowc (&wchar, last[i-1], MAX_BYTES);
171       if (retval == i)
172         printf("2.2.%d: U-%08lx\n", i, (long) wchar);
173       else
174         printf("2.2.%d: Invalid\n", i);
175     }
176 
177     /* 2.3  Other boundary conditions */
178     for (i = 1; i < 6; i++)
179       {
180         retval = mbtowc (&wchar, boundary[i-1], MAX_BYTES);
181         if ((i < 4 && retval == 3) || (i > 3 && retval == 4))
182           printf("2.3.%d: U-%08lx\n", i, (long) wchar);
183         else
184           printf("2.3.%d: Invalid\n", i);
185       }
186 
187     /* 3  Malformed sequences */
188     /* 3.1  Unexpected continuation bytes */
189     retval = mbtowc (&wchar, continuation_bytes[0], MAX_BYTES);
190     if (retval == 1)
191       printf("3.1.1: U-%08lx\n", (long) wchar);
192     else
193       printf("3.1.1: 1 Invalid\n");
194 
195     retval = mbtowc (&wchar, continuation_bytes[1], MAX_BYTES);
196     if (retval == 1)
197       printf("3.1.2: U-%08lx\n", (long) wchar);
198     else
199       printf("3.1.2: 1 Invalid\n");
200 
201     for(i=2; i< 8; i++)
202       {
203         retval = num_invalid(continuation_bytes[i], i);
204         if (retval == -1)
205           printf("3.1.%d: Valid Character Found\n", i+1);
206         else
207           printf("3.1.%d: %d Invalid\n", i+1, retval);
208       }
209 
210     for(i = 0x80; i < 0xc0; i++)
211       all_continuation_bytes[i-0x80] = i;
212 
213     retval = num_invalid(all_continuation_bytes, 0xc0 - 0x80);
214     if (retval == -1)
215       printf("3.1.9: Valid Character Found\n");
216     else
217       printf("3.1.9: %d Invalid\n", retval);
218 
219     /* 3.2  Lonely start characters */
220     for(i = 0xc0; i < 0xe0; i++)
221       all_two_byte_seq[i-0xc0] = i;
222 
223     retval = num_invalid(all_two_byte_seq, 0xe0 - 0xc0);
224     if (retval == -1)
225       printf("3.2.1: Valid Character Found\n");
226     else
227       printf("3.2.1: %d Invalid\n", retval);
228 
229     for(i = 0xe0; i < 0xf0; i++)
230       all_three_byte_seq[i-0xe0] = i;
231 
232     retval = num_invalid(all_three_byte_seq, 0xf0 - 0xe0);
233     if (retval == -1)
234       printf("3.2.2: Valid Character Found\n");
235     else
236       printf("3.2.2: %d Invalid\n", retval);
237 
238     for(i = 0xf0; i < 0xf8; i++)
239       all_four_byte_seq[i-0xf0] = i;
240 
241     retval = num_invalid(all_four_byte_seq, 0xf8 - 0xf0);
242     if (retval == -1)
243       printf("3.2.3: Valid Character Found\n");
244     else
245       printf("3.2.3: %d Invalid\n", retval);
246 
247     for(i = 0xf8; i < 0xfc; i++)
248       all_five_byte_seq[i-0xf8] = i;
249 
250     retval = num_invalid(all_five_byte_seq, 0xfc - 0xf8);
251     if (retval == -1)
252       printf("3.2.4: Valid Character Found\n");
253     else
254       printf("3.2.4: %d Invalid\n", retval);
255 
256     for(i = 0xfc; i < 0xfe; i++)
257       all_six_byte_seq[i-0xfc] = i;
258 
259     retval = num_invalid(all_six_byte_seq, 0xfe - 0xfc);
260     if (retval == -1)
261       printf("3.2.5: Valid Character Found\n");
262     else
263       printf("3.2.5: %d Invalid\n", retval);
264 
265     /* 3.3  Sequences with last continuation byte missing */
266     for(i = 1; i < 6; i++)
267       {
268         retval = mbtowc(&wchar, incomplete_seq[i-1], i);
269         if(retval == -1)
270           printf("3.3.%d: 1 Invalid\n", i);
271         else
272           printf("3.3.%d: Valid Character Found\n", i);
273       }
274 
275     for(i = 6; i < 11; i++)
276       {
277         retval = mbtowc(&wchar, incomplete_seq[i-1], i - 5);
278         if(retval == -1)
279           printf("3.3.%d: 1 Invalid\n", i);
280         else
281           printf("3.3.%d: Valid Character Found\n", i);
282       }
283 
284     /* 3.4  Concatenation of incomplete sequences */
285     /* This test is excluded because the mbtowc function does not return the
286        number of bytes read in an invalid multi-byte sequence. */
287 
288     /* 3.5  Impossible bytes */
289     retval = mbtowc(&wchar, impossible_bytes[0], 1);
290     if(retval == -1)
291       printf("3.5.1: 1 Invalid\n");
292     else
293       printf("3.5.1: Valid Character Found\n");
294 
295     retval = mbtowc(&wchar, impossible_bytes[1], 1);
296     if(retval == -1)
297       printf("3.5.2: 1 Invalid\n");
298     else
299       printf("3.5.2: Valid Character Found\n");
300 
301     retval = mbtowc(&wchar, impossible_bytes[2], 4);
302     if(retval == -1)
303       printf("3.5.3: 1 Invalid\n");
304     else
305       printf("3.5.3: Valid Character Found\n");
306 
307     /* 4  Overlong sequences */
308     /* 4.1  Examples of an overlong ASCII character */
309     for(i = 2; i < 7; i++)
310       {
311         retval = mbtowc(&wchar, overlong[i-2], i);
312         if(retval == -1)
313           printf("4.1.%d: 1 Invalid\n", i-1);
314         else
315           printf("4.1.%d: Valid Character Found\n", i-1);
316       }
317 
318     /* 4.2  Maximum overlong sequences */
319     for(i = 2; i < 7; i++)
320       {
321         retval = mbtowc(&wchar, overlong_max[i-2], i);
322         if(retval == -1)
323           printf("4.2.%d: 1 Invalid\n", i-1);
324         else
325           printf("4.2.%d: Valid Character Found\n", i-1);
326       }
327 
328     /* 4.3  Overlong representation of the NUL character */
329     for(i = 2; i < 7; i++)
330       {
331         retval = mbtowc(&wchar, overlong_nul[i-2], i);
332         if(retval == -1)
333           printf("4.3.%d: 1 Invalid\n", i-1);
334         else
335           printf("4.3.%d: Valid Character Found\n", i-1);
336       }
337 
338     /* 5  Illegal code positions */
339     /* 5.1 Single UTF-16 surrogates */
340     for (i = 1; i < 8; i++)
341       {
342         retval = mbtowc(&wchar, single_surrogates[i-1], 3);
343         if(retval == -1)
344           printf("5.1.%d: 1 Invalid\n", i);
345         else
346           printf("5.1.%d: Valid Character Found\n", i);
347       }
348 
349     /* 5.2 Paired UTF-16 surrogates */
350     for (i = 1; i < 8; i++)
351       {
352         retval = mbtowc(&wchar, paired_surrogates[i-1], 6);
353         if(retval == -1)
354           printf("5.2.%d: 1 Invalid\n", i);
355         else
356           printf("5.2.%d: Valid Character Found\n", i);
357       }
358 
359     /* 5.3 Other illegal code positions */
360     retval = mbtowc(&wchar, illegal_pos[0], 3);
361     if(retval == -1)
362       printf("5.3.1: 1 Invalid\n");
363     else
364       printf("5.3.1: Valid Character Found\n");
365 
366     retval = mbtowc(&wchar, illegal_pos[1], 3);
367     if(retval == -1)
368       printf("5.3.2: 1 Invalid\n");
369     else
370       printf("5.3.2: Valid Character Found\n");
371 
372     return 0;
373   }
374 
/* Probe every byte offset of the LEN-byte buffer S with mbtowc, each call
   being allowed to look at the bytes that remain from that offset on.

   Returns -1 as soon as any call returns something other than -1 (that
   includes a return of 0 for a NUL byte).  Otherwise returns the number
   of offsets probed, all of which mbtowc rejected; this is LEN, or 0 when
   LEN is zero or negative.  */
int
num_invalid(const char *s, int len)
{
  wchar_t decoded;
  int rejected = 0;
  int remaining;

  /* S is advanced one byte per probe, whether or not a longer sequence
     started at the previous offset.  */
  for (remaining = len; remaining > 0; remaining--, s++)
    {
      if (mbtowc(&decoded, s, remaining) != -1)
        return -1;
      rejected++;
    }

  return rejected;
}
398