1 /*
2  * Copyright (c) 2003-2004, Artem B. Bityuckiy
3  * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 #include "cesbi.h"
27 
28 #if defined (ICONV_TO_UCS_CES_UTF_8) \
29  || defined (ICONV_FROM_UCS_CES_UTF_8)
30 
31 #include <sys/types.h>
32 #include "../lib/local.h"
33 #include "../lib/ucsconv.h"
34 
35 #define UTF8_MB_CUR_MAX 6
36 
/*
 * The UTF-8 CES converter does not interpret the BOM. It rejects overlong
 * sequences, the codes U+FFFF and U+FFFE, UTF-16 surrogate codes
 * (U+D800..U+DFFF) and all codes > 0x7FFFFFFF.
 */
41 
42 #if defined (ICONV_FROM_UCS_CES_UTF_8)
43 static size_t
convert_from_ucs(void * data,register ucs4_t in,unsigned char ** outbuf,size_t * outbytesleft)44 convert_from_ucs (void *data,
45                          register ucs4_t in,
46                          unsigned char **outbuf,
47                          size_t *outbytesleft)
48 {
49   register unsigned char *cp;
50   register size_t bytes;
51 
52   (void) data;
53   if ((in  >= 0x0000D800 && in <= 0x0000DFFF)
54       || in > 0x7FFFFFFF || in == 0x0000FFFF || in == 0x0000FFFE)
55     return (size_t)ICONV_CES_INVALID_CHARACTER;
56 
57   if (in < 0x80)
58     bytes = 1;
59   else if (in < 0x800)
60     bytes = 2;
61   else if (in < 0x10000)
62     bytes = 3;
63   else if (in < 0x200000)
64     bytes = 4;
65   else if (in < 0x4000000)
66     bytes = 5;
67   else
68     bytes = 6;
69 
70   if (*outbytesleft < bytes)
71     return (size_t)ICONV_CES_NOSPACE;
72 
73   cp = *outbuf;
74 
75   switch (bytes)
76     {
77       case 1:
78         *cp = (unsigned char)in;
79         break;
80 
81       case 2:
82         *cp++ = (unsigned char)((in >> 6) | 0x000000C0);
83         *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
84         break;
85 
86       case 3:
87         *cp++ = (unsigned char)((in >> 12) | 0x000000E0);
88         *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
89         *cp++ = (unsigned char)((in        & 0x0000003F) | 0x00000080);
90         break;
91 
92       case 4:
93         *cp++ = (unsigned char)((in >> 18)  | 0x000000F0);
94         *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
95         *cp++ = (unsigned char)(((in >> 6)  & 0x0000003F) | 0x00000080);
96         *cp++ = (unsigned char)((in         & 0x0000003F) | 0x00000080);
97         break;
98 
99       case 5:
100         *cp++ = (unsigned char)((in >> 24)  | 0x000000F8);
101         *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080);
102         *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
103         *cp++ = (unsigned char)(((in >> 6)  & 0x0000003F) | 0x00000080);
104         *cp++ = (unsigned char)((in         & 0x0000003F) | 0x00000080);
105         break;
106 
107       case 6:
108         *cp++ = (unsigned char)((in >> 30)  | 0x000000FC);
109         *cp++ = (unsigned char)(((in >> 24) & 0x0000003F) | 0x00000080);
110         *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080);
111         *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
112         *cp++ = (unsigned char)(((in >> 6)  & 0x0000003F) | 0x00000080);
113         *cp++ = (unsigned char)((in         & 0x0000003F) | 0x00000080);
114         break;
115     }
116 
117   *outbytesleft -= bytes;
118   *outbuf += bytes;
119 
120   return bytes;
121 }
122 #endif /* ICONV_FROM_UCS_CES_UTF_8 */
123 
124 #if defined (ICONV_TO_UCS_CES_UTF_8)
125 static ucs4_t
convert_to_ucs(void * data,const unsigned char ** inbuf,size_t * inbytesleft)126 convert_to_ucs (void *data,
127                        const unsigned char **inbuf,
128                        size_t *inbytesleft)
129 {
130   register const unsigned char *in = *inbuf;
131   register size_t bytes;
132   ucs4_t res;
133 
134   (void) data;
135   if (in[0] >= 0xC0)
136     {
137       if (in[0] < 0xE0)
138         {
139           if (*inbytesleft < (bytes = 2))
140             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
141 
142           if (   ((in[0] & ~0x1F) == 0xC0)
143               && ((in[1] & 0xC0)  == 0x80))
144             res = ((ucs4_t)(in[0] & 0x1F) << 6)
145                 | ((ucs4_t)(in[1] & 0x3F));
146           else
147             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
148 
149           if (res < 0x00000080) /* Overlong sequence */
150             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
151         }
152 
153       else if (in[0] < 0xF0)
154         {
155           if (*inbytesleft < (bytes = 3))
156             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
157 
158           if (   ((in[0] & ~0x0F) == 0xE0)
159               && ((in[1] & 0xC0)  == 0x80)
160               && ((in[2] & 0xC0)  == 0x80))
161             res = ((ucs4_t)(in[0] & 0x0F) << 12)
162                 | ((ucs4_t)(in[1] & 0x3F) << 6)
163                 | ((ucs4_t)(in[2] & 0x3F));
164           else
165             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
166 
167           if (res < 0x00000800) /* Overlong sequence */
168             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
169         }
170 
171       else if (in[0] < 0xF8)
172         {
173           if (*inbytesleft < (bytes = 4))
174             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
175 
176           if (   ((in[0] & ~0x07) == 0xF0)
177               && ((in[1] & 0xC0)  == 0x80)
178               && ((in[2] & 0xC0)  == 0x80)
179               && ((in[3] & 0xC0)  == 0x80))
180             res = ((ucs4_t)(in[0] & 0x07) << 18)
181                 | ((ucs4_t)(in[1] & 0x3F) << 12)
182                 | ((ucs4_t)(in[2] & 0x3F) << 6)
183                 | ((ucs4_t)(in[3] & 0x3F));
184           else
185             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
186 
187           if (res < 0x00010000) /* Overlong sequence */
188             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
189         }
190 
191       else if (in[0] < 0xFC)
192         {
193           if (*inbytesleft < (bytes = 5))
194             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
195 
196           if (   ((in[0] & ~0x03) == 0xF8)
197               && ((in[1] & 0xC0)  == 0x80)
198               && ((in[2] & 0xC0)  == 0x80)
199               && ((in[3] & 0xC0)  == 0x80)
200               && ((in[4] & 0xC0)  == 0x80))
201             res = ((ucs4_t)(in[0] & 0x03) << 24)
202                 | ((ucs4_t)(in[1] & 0x3F) << 18)
203                 | ((ucs4_t)(in[2] & 0x3F) << 12)
204                 | ((ucs4_t)(in[3] & 0x3F) << 6)
205                 | ((ucs4_t)(in[4] & 0x3F));
206           else
207             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
208 
209           if (res < 0x00200000) /* Overlong sequence */
210             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
211         }
212 
213       else if (in[0] <= 0xFD)
214         {
215           if (*inbytesleft < (bytes = 6))
216             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
217 
218           if (   ((in[0] & ~0x01) == 0xFC)
219               && ((in[1] & 0xC0)  == 0x80)
220               && ((in[2] & 0xC0)  == 0x80)
221               && ((in[3] & 0xC0)  == 0x80)
222               && ((in[4] & 0xC0)  == 0x80)
223               && ((in[5] & 0xC0)  == 0x80))
224               res = ((ucs4_t)(in[0] & 0x1)  << 30)
225                   | ((ucs4_t)(in[1] & 0x3F) << 24)
226                   | ((ucs4_t)(in[2] & 0x3F) << 18)
227                   | ((ucs4_t)(in[3] & 0x3F) << 12)
228                   | ((ucs4_t)(in[4] & 0x3F) << 6)
229                   | ((ucs4_t)(in[5] & 0x3F));
230           else
231             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
232 
233           if (res < 0x04000000) /* Overlong sequence */
234             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
235         }
236 
237       else
238         return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
239     }
240   else if (in[0] & 0x80)
241     return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
242   else
243     {
244       res = (ucs4_t)in[0];
245       bytes = 1;
246     }
247 
248   if (  (res  >= 0x0000D800 && res <= 0x0000DFFF)
249       || res > 0x7FFFFFFF || res == 0x0000FFFF || res == 0x0000FFFE)
250     return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
251 
252   *inbytesleft -= bytes;
253   *inbuf += bytes;
254 
255   return res;
256 }
257 #endif /* ICONV_TO_UCS_CES_UTF_8 */
258 
259 static int
get_mb_cur_max(void * data)260 get_mb_cur_max (void *data)
261 {
262   (void) data;
263   return UTF8_MB_CUR_MAX;
264 }
265 
266 #if defined (ICONV_TO_UCS_CES_UTF_8)
267 const iconv_to_ucs_ces_handlers_t
268 _iconv_to_ucs_ces_handlers_utf_8 =
269 {
270   NULL,
271   NULL,
272   get_mb_cur_max,
273   NULL,
274   NULL,
275   NULL,
276   convert_to_ucs
277 };
278 #endif
279 
280 #if defined (ICONV_FROM_UCS_CES_UTF_8)
281 const iconv_from_ucs_ces_handlers_t
282 _iconv_from_ucs_ces_handlers_utf_8 =
283 {
284   NULL,
285   NULL,
286   get_mb_cur_max,
287   NULL,
288   NULL,
289   NULL,
290   convert_from_ucs
291 };
292 #endif
293 
294 #endif /* ICONV_TO_UCS_CES_UTF_8 || ICONV_FROM_UCS_CES_UTF_8 */
295 
296