1 /*
2 * Copyright (c) 2003-2004, Artem B. Bityuckiy
3 * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 #include "cesbi.h"
27
28 #if defined (ICONV_TO_UCS_CES_UTF_16) \
29 || defined (ICONV_FROM_UCS_CES_UTF_16)
30
31 #include <_ansi.h>
32 #include <sys/types.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include "../lib/local.h"
37 #include "../lib/ucsconv.h"
38 #include "../lib/endian.h"
39
/*
 * On input, the UTF-16 converter interprets the BOM and assumes Big Endian
 * byte order if the BOM is absent. On output, the UTF-16 converter uses the
 * System Endian byte order and writes the corresponding BOM as the first
 * code. The UTF-16LE and UTF-16BE converters do not interpret a BOM on input
 * (it is decoded like any other code) and do not write a BOM on output.
 */
46
/*
 * Converter state values kept in the int allocated by the init handlers.
 * UTF16_BOM_WRITTEN is OR'ed into UTF16_SYSTEM_ENDIAN once the output
 * converter has emitted the byte order mark.
 */
#define UTF16_UNDEFINED      0          /* input byte order not yet detected */
#define UTF16_BIG_ENDIAN     (1 << 0)
#define UTF16_LITTLE_ENDIAN  (1 << 1)
#define UTF16_SYSTEM_ENDIAN  (1 << 2)   /* output in host order, BOM pending */
#define UTF16_BOM_WRITTEN    (1 << 3)

/* Byte order mark code. */
#define UTF16_BOM            0xFEFF

/* Encoding names the init handlers compare against. */
#define UTF_16               "utf_16"
#define UTF_16BE             "utf_16be"
#define UTF_16LE             "utf_16le"
58
/*
 * utf_16_close - release the converter state.
 *
 * PARAMETERS:
 *   void *data - state block to release (the init handlers in this file
 *                allocate it with malloc).
 *
 * RETURN:
 *   Always 0.
 */
static size_t
utf_16_close (void *data)
{
  free (data);

  return (size_t)0;
}
66
67 #if defined (ICONV_FROM_UCS_CES_UTF_16)
68 static void *
utf_16_init_from_ucs(const char * encoding)69 utf_16_init_from_ucs (
70 const char *encoding)
71 {
72 int *data;
73
74 if ((data = (int *)malloc (sizeof (int))) == NULL)
75 return (void *)NULL;
76
77 if (strcmp (encoding, UTF_16LE) == 0)
78 *data = UTF16_LITTLE_ENDIAN;
79 else if (strcmp (encoding, UTF_16BE) == 0)
80 *data = UTF16_BIG_ENDIAN;
81 else
82 *data = UTF16_SYSTEM_ENDIAN;
83
84 return (void *)data;
85 }
86
87 static size_t
utf_16_convert_from_ucs(void * data,register ucs4_t in,unsigned char ** outbuf,size_t * outbytesleft)88 utf_16_convert_from_ucs (void *data,
89 register ucs4_t in,
90 unsigned char **outbuf,
91 size_t *outbytesleft)
92 {
93 register ucs2_t *cp;
94 register size_t bytes;
95 register int *state;
96
97 if (in > 0x0010FFFF || (in >= 0x0000D800 && in <= 0x0000DFFF)
98 || in == 0x0000FFFF || in == 0x0000FFFE)
99 return (size_t)ICONV_CES_INVALID_CHARACTER;
100
101 state = (int *)data;
102 bytes = (*state == UTF16_SYSTEM_ENDIAN) ? sizeof (ucs2_t) * 2
103 : sizeof (ucs2_t);
104
105 if (in > 0x0000FFFF)
106 bytes += sizeof (ucs2_t);
107
108 if (*outbytesleft < bytes)
109 return (size_t)ICONV_CES_NOSPACE;
110
111 cp = (ucs2_t *)*outbuf;
112
113 if (*state == UTF16_SYSTEM_ENDIAN)
114 {
115 *cp++ = UTF16_BOM;
116 *state |= UTF16_BOM_WRITTEN;
117 }
118
119 if (in < 0x00010000)
120 {
121 switch (*state)
122 {
123 case UTF16_LITTLE_ENDIAN:
124 *cp = ICONV_HTOLES ((ucs2_t)in);
125 break;
126 case UTF16_BIG_ENDIAN:
127 *cp = ICONV_HTOBES ((ucs2_t)in);
128 break;
129 case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
130 *cp = (ucs2_t)in;
131 break;
132 }
133 }
134 else
135 {
136 ucs2_t w1, w2;
137
138 /* Process surrogate pair */
139 in -= 0x00010000;
140 w1 = ((ucs2_t)((in >> 10)) & 0x03FF) | 0xD800;
141 w2 = (ucs2_t)(in & 0x000003FF) | 0xDC00;
142
143 switch (*state)
144 {
145 case UTF16_LITTLE_ENDIAN:
146 *cp++ = ICONV_HTOLES (w1);
147 *cp = ICONV_HTOLES (w2);
148 break;
149 case UTF16_BIG_ENDIAN:
150 *cp++ = ICONV_HTOBES (w1);
151 *cp = ICONV_HTOBES (w2);
152 break;
153 case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
154 *cp++ = w1;
155 *cp = w2;
156 break;
157 }
158 }
159
160 *outbuf += bytes;
161 *outbytesleft -= bytes;
162
163 return bytes;
164 }
165 #endif /* ICONV_FROM_UCS_CES_UTF_16 */
166
167 #if defined (ICONV_TO_UCS_CES_UTF_16)
168 static void *
utf_16_init_to_ucs(const char * encoding)169 utf_16_init_to_ucs (
170 const char *encoding)
171 {
172 int *data;
173
174 if ((data = (int *)malloc (sizeof (int))) == NULL)
175 return (void *)NULL;
176
177 if (strcmp (encoding, UTF_16BE) == 0)
178 *data = UTF16_BIG_ENDIAN;
179 else if (strcmp (encoding, UTF_16LE) == 0)
180 *data = UTF16_LITTLE_ENDIAN;
181 else
182 *data = UTF16_UNDEFINED;
183
184 return (void *)data;
185 }
186
187 static ucs4_t
utf_16_convert_to_ucs(void * data,const unsigned char ** inbuf,size_t * inbytesleft)188 utf_16_convert_to_ucs (void *data,
189 const unsigned char **inbuf,
190 size_t *inbytesleft)
191 {
192 register ucs2_t w1;
193 register ucs2_t w2;
194 register ucs2_t *cp;
195 int *state;
196 ucs4_t res;
197 size_t bytes = sizeof (ucs2_t);
198
199 (void) data;
200 if (*inbytesleft < bytes)
201 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
202
203 state = (int *)data;
204 cp = ((ucs2_t *)*inbuf);
205
206 if (*state == UTF16_UNDEFINED)
207 {
208 if (*cp == ICONV_HTOLES(UTF16_BOM))
209 *state = UTF16_LITTLE_ENDIAN;
210 else
211 *state = UTF16_BIG_ENDIAN;
212
213 if ( *cp == ICONV_HTOBES (UTF16_BOM)
214 || *cp == ICONV_HTOLES (UTF16_BOM))
215 {
216 if (*inbytesleft < (bytes += sizeof (ucs2_t)))
217 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
218 cp += 1;
219 }
220 }
221
222 if (*state == UTF16_LITTLE_ENDIAN)
223 w1 = ICONV_LETOHS (*cp);
224 else
225 w1 = ICONV_BETOHS (*cp);
226
227 if (w1 < 0xD800 || w1 > 0xDFFF)
228 {
229 if (w1 == 0xFFFF || w1 == 0xFFFE)
230 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
231 res = (ucs4_t)w1;
232 }
233 else
234 {
235 /* Process surrogate pair */
236 if (*inbytesleft < (bytes += 2))
237 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
238
239 if (w1 > 0xDBFF)
240 /* Broken surrogate character */
241 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
242
243 cp += 1;
244
245 if (*state == UTF16_LITTLE_ENDIAN)
246 w2 = ICONV_LETOHS (*cp);
247 else
248 w2 = ICONV_BETOHS (*cp);
249
250 if (w2 < 0xDC00 || w2 > 0xDFFF)
251 /* Broken surrogate character */
252 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
253
254 res = (ucs4_t)(w2 & 0x03FF) | ((ucs4_t)(w1 & 0x03FF) << 10);
255 res += 0x00010000;
256 }
257
258 *inbuf += bytes;
259 *inbytesleft -= bytes;
260
261 return res;
262 }
263 #endif /* ICONV_TO_UCS_CES_UTF_16 */
264
/*
 * utf_16_get_mb_cur_max - report the maximum bytes per character.
 *
 * PARAMETERS:
 *   void *data - unused.
 *
 * RETURN:
 *   Always 6 (presumably a BOM plus a surrogate pair - confirm).
 */
static int
utf_16_get_mb_cur_max (void *data)
{
  enum { utf_16_max_bytes = 6 };

  (void) data;

  return utf_16_max_bytes;
}
271
272 #if defined (ICONV_TO_UCS_CES_UTF_16)
273 const iconv_to_ucs_ces_handlers_t
274 _iconv_to_ucs_ces_handlers_utf_16 =
275 {
276 utf_16_init_to_ucs,
277 utf_16_close,
278 utf_16_get_mb_cur_max,
279 NULL,
280 NULL,
281 NULL,
282 utf_16_convert_to_ucs
283 };
284 #endif
285
286 #if defined (ICONV_FROM_UCS_CES_UTF_16)
287 const iconv_from_ucs_ces_handlers_t
288 _iconv_from_ucs_ces_handlers_utf_16 =
289 {
290 utf_16_init_from_ucs,
291 utf_16_close,
292 utf_16_get_mb_cur_max,
293 NULL,
294 NULL,
295 NULL,
296 utf_16_convert_from_ucs
297 };
298 #endif
299
300 #endif /* ICONV_TO_UCS_CES_UTF_16 || ICONV_FROM_UCS_CES_UTF_16 */
301
302