1 /*
2 * Copyright (c) 2003-2004, Artem B. Bityuckiy
3 * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 #include "cesbi.h"
27
28 #if defined (ICONV_TO_UCS_CES_UTF_8) \
29 || defined (ICONV_FROM_UCS_CES_UTF_8)
30
31 #include <_ansi.h>
32 #include <sys/types.h>
33 #include "../lib/local.h"
34 #include "../lib/ucsconv.h"
35
/* Longest byte sequence used for a single code by the converters in this
   file, which handle sequences of 1 to 6 bytes.  */
#define UTF8_MB_CUR_MAX 6
37
/*
 * The UTF-8 CES converter does not interpret the BOM. It rejects overlong
 * sequences, the codes U+FFFF and U+FFFE, UTF-16 surrogate codes and all
 * codes > 0x7FFFFFFF.
 */
42
43 #if defined (ICONV_FROM_UCS_CES_UTF_8)
44 static size_t
convert_from_ucs(void * data,register ucs4_t in,unsigned char ** outbuf,size_t * outbytesleft)45 convert_from_ucs (void *data,
46 register ucs4_t in,
47 unsigned char **outbuf,
48 size_t *outbytesleft)
49 {
50 register unsigned char *cp;
51 register size_t bytes;
52
53 (void) data;
54 if ((in >= 0x0000D800 && in <= 0x0000DFFF)
55 || in > 0x7FFFFFFF || in == 0x0000FFFF || in == 0x0000FFFE)
56 return (size_t)ICONV_CES_INVALID_CHARACTER;
57
58 if (in < 0x80)
59 bytes = 1;
60 else if (in < 0x800)
61 bytes = 2;
62 else if (in < 0x10000)
63 bytes = 3;
64 else if (in < 0x200000)
65 bytes = 4;
66 else if (in < 0x4000000)
67 bytes = 5;
68 else
69 bytes = 6;
70
71 if (*outbytesleft < bytes)
72 return (size_t)ICONV_CES_NOSPACE;
73
74 cp = *outbuf;
75
76 switch (bytes)
77 {
78 case 1:
79 *cp = (unsigned char)in;
80 break;
81
82 case 2:
83 *cp++ = (unsigned char)((in >> 6) | 0x000000C0);
84 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
85 break;
86
87 case 3:
88 *cp++ = (unsigned char)((in >> 12) | 0x000000E0);
89 *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
90 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
91 break;
92
93 case 4:
94 *cp++ = (unsigned char)((in >> 18) | 0x000000F0);
95 *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
96 *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
97 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
98 break;
99
100 case 5:
101 *cp++ = (unsigned char)((in >> 24) | 0x000000F8);
102 *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080);
103 *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
104 *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
105 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
106 break;
107
108 case 6:
109 *cp++ = (unsigned char)((in >> 30) | 0x000000FC);
110 *cp++ = (unsigned char)(((in >> 24) & 0x0000003F) | 0x00000080);
111 *cp++ = (unsigned char)(((in >> 18) & 0x0000003F) | 0x00000080);
112 *cp++ = (unsigned char)(((in >> 12) & 0x0000003F) | 0x00000080);
113 *cp++ = (unsigned char)(((in >> 6) & 0x0000003F) | 0x00000080);
114 *cp++ = (unsigned char)((in & 0x0000003F) | 0x00000080);
115 break;
116 }
117
118 *outbytesleft -= bytes;
119 *outbuf += bytes;
120
121 return bytes;
122 }
123 #endif /* ICONV_FROM_UCS_CES_UTF_8 */
124
125 #if defined (ICONV_TO_UCS_CES_UTF_8)
126 static ucs4_t
convert_to_ucs(void * data,const unsigned char ** inbuf,size_t * inbytesleft)127 convert_to_ucs (void *data,
128 const unsigned char **inbuf,
129 size_t *inbytesleft)
130 {
131 register const unsigned char *in = *inbuf;
132 register size_t bytes;
133 ucs4_t res;
134
135 (void) data;
136 if (in[0] >= 0xC0)
137 {
138 if (in[0] < 0xE0)
139 {
140 if (*inbytesleft < (bytes = 2))
141 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
142
143 if ( ((in[0] & ~0x1F) == 0xC0)
144 && ((in[1] & 0xC0) == 0x80))
145 res = ((ucs4_t)(in[0] & 0x1F) << 6)
146 | ((ucs4_t)(in[1] & 0x3F));
147 else
148 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
149
150 if (res < 0x00000080) /* Overlong sequence */
151 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
152 }
153
154 else if (in[0] < 0xF0)
155 {
156 if (*inbytesleft < (bytes = 3))
157 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
158
159 if ( ((in[0] & ~0x0F) == 0xE0)
160 && ((in[1] & 0xC0) == 0x80)
161 && ((in[2] & 0xC0) == 0x80))
162 res = ((ucs4_t)(in[0] & 0x0F) << 12)
163 | ((ucs4_t)(in[1] & 0x3F) << 6)
164 | ((ucs4_t)(in[2] & 0x3F));
165 else
166 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
167
168 if (res < 0x00000800) /* Overlong sequence */
169 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
170 }
171
172 else if (in[0] < 0xF8)
173 {
174 if (*inbytesleft < (bytes = 4))
175 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
176
177 if ( ((in[0] & ~0x07) == 0xF0)
178 && ((in[1] & 0xC0) == 0x80)
179 && ((in[2] & 0xC0) == 0x80)
180 && ((in[3] & 0xC0) == 0x80))
181 res = ((ucs4_t)(in[0] & 0x07) << 18)
182 | ((ucs4_t)(in[1] & 0x3F) << 12)
183 | ((ucs4_t)(in[2] & 0x3F) << 6)
184 | ((ucs4_t)(in[3] & 0x3F));
185 else
186 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
187
188 if (res < 0x00010000) /* Overlong sequence */
189 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
190 }
191
192 else if (in[0] < 0xFC)
193 {
194 if (*inbytesleft < (bytes = 5))
195 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
196
197 if ( ((in[0] & ~0x03) == 0xF8)
198 && ((in[1] & 0xC0) == 0x80)
199 && ((in[2] & 0xC0) == 0x80)
200 && ((in[3] & 0xC0) == 0x80)
201 && ((in[4] & 0xC0) == 0x80))
202 res = ((ucs4_t)(in[0] & 0x03) << 24)
203 | ((ucs4_t)(in[1] & 0x3F) << 18)
204 | ((ucs4_t)(in[2] & 0x3F) << 12)
205 | ((ucs4_t)(in[3] & 0x3F) << 6)
206 | ((ucs4_t)(in[4] & 0x3F));
207 else
208 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
209
210 if (res < 0x00200000) /* Overlong sequence */
211 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
212 }
213
214 else if (in[0] <= 0xFD)
215 {
216 if (*inbytesleft < (bytes = 6))
217 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
218
219 if ( ((in[0] & ~0x01) == 0xFC)
220 && ((in[1] & 0xC0) == 0x80)
221 && ((in[2] & 0xC0) == 0x80)
222 && ((in[3] & 0xC0) == 0x80)
223 && ((in[4] & 0xC0) == 0x80)
224 && ((in[5] & 0xC0) == 0x80))
225 res = ((ucs4_t)(in[0] & 0x1) << 30)
226 | ((ucs4_t)(in[1] & 0x3F) << 24)
227 | ((ucs4_t)(in[2] & 0x3F) << 18)
228 | ((ucs4_t)(in[3] & 0x3F) << 12)
229 | ((ucs4_t)(in[4] & 0x3F) << 6)
230 | ((ucs4_t)(in[5] & 0x3F));
231 else
232 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
233
234 if (res < 0x04000000) /* Overlong sequence */
235 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
236 }
237
238 else
239 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
240 }
241 else if (in[0] & 0x80)
242 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
243 else
244 {
245 res = (ucs4_t)in[0];
246 bytes = 1;
247 }
248
249 if ( (res >= 0x0000D800 && res <= 0x0000DFFF)
250 || res > 0x7FFFFFFF || res == 0x0000FFFF || res == 0x0000FFFE)
251 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
252
253 *inbytesleft -= bytes;
254 *inbuf += bytes;
255
256 return res;
257 }
258 #endif /* ICONV_TO_UCS_CES_UTF_8 */
259
260 static int
get_mb_cur_max(void * data)261 get_mb_cur_max (void *data)
262 {
263 (void) data;
264 return UTF8_MB_CUR_MAX;
265 }
266
267 #if defined (ICONV_TO_UCS_CES_UTF_8)
268 const iconv_to_ucs_ces_handlers_t
269 _iconv_to_ucs_ces_handlers_utf_8 =
270 {
271 NULL,
272 NULL,
273 get_mb_cur_max,
274 NULL,
275 NULL,
276 NULL,
277 convert_to_ucs
278 };
279 #endif
280
281 #if defined (ICONV_FROM_UCS_CES_UTF_8)
282 const iconv_from_ucs_ces_handlers_t
283 _iconv_from_ucs_ces_handlers_utf_8 =
284 {
285 NULL,
286 NULL,
287 get_mb_cur_max,
288 NULL,
289 NULL,
290 NULL,
291 convert_from_ucs
292 };
293 #endif
294
295 #endif /* ICONV_TO_UCS_CES_UTF_8 || ICONV_FROM_UCS_CES_UTF_8 */
296
297