1 /*
2  * Copyright (c) 2003-2004, Artem B. Bityuckiy
3  * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 #include "cesbi.h"
27 
28 #if defined (ICONV_TO_UCS_CES_UTF_8) \
29  || defined (ICONV_FROM_UCS_CES_UTF_8)
30 
31 #include <_ansi.h>
32 #include <sys/types.h>
33 #include "../lib/local.h"
34 #include "../lib/ucsconv.h"
35 
36 #define UTF8_MB_CUR_MAX 6
37 
/*
 * The UTF-8 CES converter does not interpret the BOM. It rejects overlong
 * sequences, the codes U+FFFF and U+FFFE, UTF-16 surrogate codes and all
 * codes > 0x7FFFFFFF.
 */
42 
43 #if defined (ICONV_FROM_UCS_CES_UTF_8)
44 static size_t
convert_from_ucs(void * data,register ucs4_t in,unsigned char ** outbuf,size_t * outbytesleft)45 convert_from_ucs (void *data,
46                          register ucs4_t in,
47                          unsigned char **outbuf,
48                          size_t *outbytesleft)
49 {
50   register unsigned char *cp;
51   register size_t bytes;
52 
53   (void) data;
54   if ((in  >= 0x0000D800 && in <= 0x0000DFFF)
55       || in > 0x7FFFFFFF || in == 0x0000FFFF || in == 0x0000FFFE)
56     return (size_t)ICONV_CES_INVALID_CHARACTER;
57 
58   if (in < 0x80)
59     bytes = 1;
60   else if (in < 0x800)
61     bytes = 2;
62   else if (in < 0x10000)
63     bytes = 3;
64   else if (in < 0x200000)
65     bytes = 4;
66   else if (in < 0x4000000)
67     bytes = 5;
68   else
69     bytes = 6;
70 
71   if (*outbytesleft < bytes)
72     return (size_t)ICONV_CES_NOSPACE;
73 
74   cp = *outbuf;
75 
76   switch (bytes)
77     {
78       case 1:
79         *cp = (unsigned char)in;
80         break;
81 
82       case 2:
83         *cp++ = (unsigned char)((in >> 6) | 0x000000C0);
84         *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
85         break;
86 
87       case 3:
88         *cp++ = (unsigned char)((in >> 12) | 0x000000E0);
89         *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
90         *cp++ = (unsigned char)((in        & 0x0000003F) | 0x00000080);
91         break;
92 
93       case 4:
94         *cp++ = (unsigned char)((in >> 18)  | 0x000000F0);
95         *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
96         *cp++ = (unsigned char)(((in >> 6)  & 0x0000003F) | 0x00000080);
97         *cp++ = (unsigned char)((in         & 0x0000003F) | 0x00000080);
98         break;
99 
100       case 5:
101         *cp++ = (unsigned char)((in >> 24)  | 0x000000F8);
102         *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080);
103         *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
104         *cp++ = (unsigned char)(((in >> 6)  & 0x0000003F) | 0x00000080);
105         *cp++ = (unsigned char)((in         & 0x0000003F) | 0x00000080);
106         break;
107 
108       case 6:
109         *cp++ = (unsigned char)((in >> 30)  | 0x000000FC);
110         *cp++ = (unsigned char)(((in >> 24) & 0x0000003F) | 0x00000080);
111         *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080);
112         *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
113         *cp++ = (unsigned char)(((in >> 6)  & 0x0000003F) | 0x00000080);
114         *cp++ = (unsigned char)((in         & 0x0000003F) | 0x00000080);
115         break;
116     }
117 
118   *outbytesleft -= bytes;
119   *outbuf += bytes;
120 
121   return bytes;
122 }
123 #endif /* ICONV_FROM_UCS_CES_UTF_8 */
124 
125 #if defined (ICONV_TO_UCS_CES_UTF_8)
126 static ucs4_t
convert_to_ucs(void * data,const unsigned char ** inbuf,size_t * inbytesleft)127 convert_to_ucs (void *data,
128                        const unsigned char **inbuf,
129                        size_t *inbytesleft)
130 {
131   register const unsigned char *in = *inbuf;
132   register size_t bytes;
133   ucs4_t res;
134 
135   (void) data;
136   if (in[0] >= 0xC0)
137     {
138       if (in[0] < 0xE0)
139         {
140           if (*inbytesleft < (bytes = 2))
141             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
142 
143           if (   ((in[0] & ~0x1F) == 0xC0)
144               && ((in[1] & 0xC0)  == 0x80))
145             res = ((ucs4_t)(in[0] & 0x1F) << 6)
146                 | ((ucs4_t)(in[1] & 0x3F));
147           else
148             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
149 
150           if (res < 0x00000080) /* Overlong sequence */
151             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
152         }
153 
154       else if (in[0] < 0xF0)
155         {
156           if (*inbytesleft < (bytes = 3))
157             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
158 
159           if (   ((in[0] & ~0x0F) == 0xE0)
160               && ((in[1] & 0xC0)  == 0x80)
161               && ((in[2] & 0xC0)  == 0x80))
162             res = ((ucs4_t)(in[0] & 0x0F) << 12)
163                 | ((ucs4_t)(in[1] & 0x3F) << 6)
164                 | ((ucs4_t)(in[2] & 0x3F));
165           else
166             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
167 
168           if (res < 0x00000800) /* Overlong sequence */
169             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
170         }
171 
172       else if (in[0] < 0xF8)
173         {
174           if (*inbytesleft < (bytes = 4))
175             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
176 
177           if (   ((in[0] & ~0x07) == 0xF0)
178               && ((in[1] & 0xC0)  == 0x80)
179               && ((in[2] & 0xC0)  == 0x80)
180               && ((in[3] & 0xC0)  == 0x80))
181             res = ((ucs4_t)(in[0] & 0x07) << 18)
182                 | ((ucs4_t)(in[1] & 0x3F) << 12)
183                 | ((ucs4_t)(in[2] & 0x3F) << 6)
184                 | ((ucs4_t)(in[3] & 0x3F));
185           else
186             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
187 
188           if (res < 0x00010000) /* Overlong sequence */
189             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
190         }
191 
192       else if (in[0] < 0xFC)
193         {
194           if (*inbytesleft < (bytes = 5))
195             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
196 
197           if (   ((in[0] & ~0x03) == 0xF8)
198               && ((in[1] & 0xC0)  == 0x80)
199               && ((in[2] & 0xC0)  == 0x80)
200               && ((in[3] & 0xC0)  == 0x80)
201               && ((in[4] & 0xC0)  == 0x80))
202             res = ((ucs4_t)(in[0] & 0x03) << 24)
203                 | ((ucs4_t)(in[1] & 0x3F) << 18)
204                 | ((ucs4_t)(in[2] & 0x3F) << 12)
205                 | ((ucs4_t)(in[3] & 0x3F) << 6)
206                 | ((ucs4_t)(in[4] & 0x3F));
207           else
208             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
209 
210           if (res < 0x00200000) /* Overlong sequence */
211             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
212         }
213 
214       else if (in[0] <= 0xFD)
215         {
216           if (*inbytesleft < (bytes = 6))
217             return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
218 
219           if (   ((in[0] & ~0x01) == 0xFC)
220               && ((in[1] & 0xC0)  == 0x80)
221               && ((in[2] & 0xC0)  == 0x80)
222               && ((in[3] & 0xC0)  == 0x80)
223               && ((in[4] & 0xC0)  == 0x80)
224               && ((in[5] & 0xC0)  == 0x80))
225               res = ((ucs4_t)(in[0] & 0x1)  << 30)
226                   | ((ucs4_t)(in[1] & 0x3F) << 24)
227                   | ((ucs4_t)(in[2] & 0x3F) << 18)
228                   | ((ucs4_t)(in[3] & 0x3F) << 12)
229                   | ((ucs4_t)(in[4] & 0x3F) << 6)
230                   | ((ucs4_t)(in[5] & 0x3F));
231           else
232             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
233 
234           if (res < 0x04000000) /* Overlong sequence */
235             return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
236         }
237 
238       else
239         return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
240     }
241   else if (in[0] & 0x80)
242     return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
243   else
244     {
245       res = (ucs4_t)in[0];
246       bytes = 1;
247     }
248 
249   if (  (res  >= 0x0000D800 && res <= 0x0000DFFF)
250       || res > 0x7FFFFFFF || res == 0x0000FFFF || res == 0x0000FFFE)
251     return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
252 
253   *inbytesleft -= bytes;
254   *inbuf += bytes;
255 
256   return res;
257 }
258 #endif /* ICONV_TO_UCS_CES_UTF_8 */
259 
260 static int
get_mb_cur_max(void * data)261 get_mb_cur_max (void *data)
262 {
263   (void) data;
264   return UTF8_MB_CUR_MAX;
265 }
266 
267 #if defined (ICONV_TO_UCS_CES_UTF_8)
/*
 * Handler table for the UTF-8 -> UCS direction.  The initializer is
 * positional: the third slot is get_mb_cur_max, the last slot is
 * convert_to_ucs, and every other slot is NULL.
 * NOTE(review): the slot names belong to iconv_to_ucs_ces_handlers_t, whose
 * declaration is not visible here -- confirm the order against it before
 * adding or moving entries.
 */
const iconv_to_ucs_ces_handlers_t
_iconv_to_ucs_ces_handlers_utf_8 =
{
  NULL,
  NULL,
  get_mb_cur_max,
  NULL,
  NULL,
  NULL,
  convert_to_ucs
};
279 #endif
280 
281 #if defined (ICONV_FROM_UCS_CES_UTF_8)
/*
 * Handler table for the UCS -> UTF-8 direction.  The initializer is
 * positional: the third slot is get_mb_cur_max, the last slot is
 * convert_from_ucs, and every other slot is NULL.
 * NOTE(review): the slot names belong to iconv_from_ucs_ces_handlers_t,
 * whose declaration is not visible here -- confirm the order against it
 * before adding or moving entries.
 */
const iconv_from_ucs_ces_handlers_t
_iconv_from_ucs_ces_handlers_utf_8 =
{
  NULL,
  NULL,
  get_mb_cur_max,
  NULL,
  NULL,
  NULL,
  convert_from_ucs
};
293 #endif
294 
295 #endif /* ICONV_TO_UCS_CES_UTF_8 || ICONV_FROM_UCS_CES_UTF_8 */
296 
297