1 /*
2 * Copyright (c) 2003-2004, Artem B. Bityuckiy
3 * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 #include "cesbi.h"
27
28 #if defined (ICONV_TO_UCS_CES_UTF_8) \
29 || defined (ICONV_FROM_UCS_CES_UTF_8)
30
31 #include <sys/types.h>
32 #include "../lib/local.h"
33 #include "../lib/ucsconv.h"
34
35 #define UTF8_MB_CUR_MAX 6
36
/*
 * The UTF-8 CES converter does not interpret the BOM. It rejects overlong
 * sequences, the codes U+FFFF and U+FFFE, UTF-16 surrogate codes and all
 * codes > 0x7FFFFFFF.
 */
41
42 #if defined (ICONV_FROM_UCS_CES_UTF_8)
43 static size_t
convert_from_ucs(void * data,register ucs4_t in,unsigned char ** outbuf,size_t * outbytesleft)44 convert_from_ucs (void *data,
45 register ucs4_t in,
46 unsigned char **outbuf,
47 size_t *outbytesleft)
48 {
49 register unsigned char *cp;
50 register size_t bytes;
51
52 (void) data;
53 if ((in >= 0x0000D800 && in <= 0x0000DFFF)
54 || in > 0x7FFFFFFF || in == 0x0000FFFF || in == 0x0000FFFE)
55 return (size_t)ICONV_CES_INVALID_CHARACTER;
56
57 if (in < 0x80)
58 bytes = 1;
59 else if (in < 0x800)
60 bytes = 2;
61 else if (in < 0x10000)
62 bytes = 3;
63 else if (in < 0x200000)
64 bytes = 4;
65 else if (in < 0x4000000)
66 bytes = 5;
67 else
68 bytes = 6;
69
70 if (*outbytesleft < bytes)
71 return (size_t)ICONV_CES_NOSPACE;
72
73 cp = *outbuf;
74
75 switch (bytes)
76 {
77 case 1:
78 *cp = (unsigned char)in;
79 break;
80
81 case 2:
82 *cp++ = (unsigned char)((in >> 6) | 0x000000C0);
83 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
84 break;
85
86 case 3:
87 *cp++ = (unsigned char)((in >> 12) | 0x000000E0);
88 *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
89 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
90 break;
91
92 case 4:
93 *cp++ = (unsigned char)((in >> 18) | 0x000000F0);
94 *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
95 *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
96 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
97 break;
98
99 case 5:
100 *cp++ = (unsigned char)((in >> 24) | 0x000000F8);
101 *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080);
102 *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
103 *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
104 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
105 break;
106
107 case 6:
108 *cp++ = (unsigned char)((in >> 30) | 0x000000FC);
109 *cp++ = (unsigned char)(((in >> 24) & 0x0000003F) | 0x00000080);
110 *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080);
111 *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
112 *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
113 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
114 break;
115 }
116
117 *outbytesleft -= bytes;
118 *outbuf += bytes;
119
120 return bytes;
121 }
122 #endif /* ICONV_FROM_UCS_CES_UTF_8 */
123
124 #if defined (ICONV_TO_UCS_CES_UTF_8)
125 static ucs4_t
convert_to_ucs(void * data,const unsigned char ** inbuf,size_t * inbytesleft)126 convert_to_ucs (void *data,
127 const unsigned char **inbuf,
128 size_t *inbytesleft)
129 {
130 register const unsigned char *in = *inbuf;
131 register size_t bytes;
132 ucs4_t res;
133
134 (void) data;
135 if (in[0] >= 0xC0)
136 {
137 if (in[0] < 0xE0)
138 {
139 if (*inbytesleft < (bytes = 2))
140 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
141
142 if ( ((in[0] & ~0x1F) == 0xC0)
143 && ((in[1] & 0xC0) == 0x80))
144 res = ((ucs4_t)(in[0] & 0x1F) << 6)
145 | ((ucs4_t)(in[1] & 0x3F));
146 else
147 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
148
149 if (res < 0x00000080) /* Overlong sequence */
150 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
151 }
152
153 else if (in[0] < 0xF0)
154 {
155 if (*inbytesleft < (bytes = 3))
156 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
157
158 if ( ((in[0] & ~0x0F) == 0xE0)
159 && ((in[1] & 0xC0) == 0x80)
160 && ((in[2] & 0xC0) == 0x80))
161 res = ((ucs4_t)(in[0] & 0x0F) << 12)
162 | ((ucs4_t)(in[1] & 0x3F) << 6)
163 | ((ucs4_t)(in[2] & 0x3F));
164 else
165 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
166
167 if (res < 0x00000800) /* Overlong sequence */
168 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
169 }
170
171 else if (in[0] < 0xF8)
172 {
173 if (*inbytesleft < (bytes = 4))
174 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
175
176 if ( ((in[0] & ~0x07) == 0xF0)
177 && ((in[1] & 0xC0) == 0x80)
178 && ((in[2] & 0xC0) == 0x80)
179 && ((in[3] & 0xC0) == 0x80))
180 res = ((ucs4_t)(in[0] & 0x07) << 18)
181 | ((ucs4_t)(in[1] & 0x3F) << 12)
182 | ((ucs4_t)(in[2] & 0x3F) << 6)
183 | ((ucs4_t)(in[3] & 0x3F));
184 else
185 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
186
187 if (res < 0x00010000) /* Overlong sequence */
188 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
189 }
190
191 else if (in[0] < 0xFC)
192 {
193 if (*inbytesleft < (bytes = 5))
194 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
195
196 if ( ((in[0] & ~0x03) == 0xF8)
197 && ((in[1] & 0xC0) == 0x80)
198 && ((in[2] & 0xC0) == 0x80)
199 && ((in[3] & 0xC0) == 0x80)
200 && ((in[4] & 0xC0) == 0x80))
201 res = ((ucs4_t)(in[0] & 0x03) << 24)
202 | ((ucs4_t)(in[1] & 0x3F) << 18)
203 | ((ucs4_t)(in[2] & 0x3F) << 12)
204 | ((ucs4_t)(in[3] & 0x3F) << 6)
205 | ((ucs4_t)(in[4] & 0x3F));
206 else
207 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
208
209 if (res < 0x00200000) /* Overlong sequence */
210 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
211 }
212
213 else if (in[0] <= 0xFD)
214 {
215 if (*inbytesleft < (bytes = 6))
216 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
217
218 if ( ((in[0] & ~0x01) == 0xFC)
219 && ((in[1] & 0xC0) == 0x80)
220 && ((in[2] & 0xC0) == 0x80)
221 && ((in[3] & 0xC0) == 0x80)
222 && ((in[4] & 0xC0) == 0x80)
223 && ((in[5] & 0xC0) == 0x80))
224 res = ((ucs4_t)(in[0] & 0x1) << 30)
225 | ((ucs4_t)(in[1] & 0x3F) << 24)
226 | ((ucs4_t)(in[2] & 0x3F) << 18)
227 | ((ucs4_t)(in[3] & 0x3F) << 12)
228 | ((ucs4_t)(in[4] & 0x3F) << 6)
229 | ((ucs4_t)(in[5] & 0x3F));
230 else
231 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
232
233 if (res < 0x04000000) /* Overlong sequence */
234 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
235 }
236
237 else
238 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
239 }
240 else if (in[0] & 0x80)
241 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
242 else
243 {
244 res = (ucs4_t)in[0];
245 bytes = 1;
246 }
247
248 if ( (res >= 0x0000D800 && res <= 0x0000DFFF)
249 || res > 0x7FFFFFFF || res == 0x0000FFFF || res == 0x0000FFFE)
250 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
251
252 *inbytesleft -= bytes;
253 *inbuf += bytes;
254
255 return res;
256 }
257 #endif /* ICONV_TO_UCS_CES_UTF_8 */
258
259 static int
get_mb_cur_max(void * data)260 get_mb_cur_max (void *data)
261 {
262 (void) data;
263 return UTF8_MB_CUR_MAX;
264 }
265
266 #if defined (ICONV_TO_UCS_CES_UTF_8)
267 const iconv_to_ucs_ces_handlers_t
268 _iconv_to_ucs_ces_handlers_utf_8 =
269 {
270 NULL,
271 NULL,
272 get_mb_cur_max,
273 NULL,
274 NULL,
275 NULL,
276 convert_to_ucs
277 };
278 #endif
279
280 #if defined (ICONV_FROM_UCS_CES_UTF_8)
281 const iconv_from_ucs_ces_handlers_t
282 _iconv_from_ucs_ces_handlers_utf_8 =
283 {
284 NULL,
285 NULL,
286 get_mb_cur_max,
287 NULL,
288 NULL,
289 NULL,
290 convert_from_ucs
291 };
292 #endif
293
294 #endif /* ICONV_TO_UCS_CES_UTF_8 || ICONV_FROM_UCS_CES_UTF_8 */
295
296