1 /*
2 * Copyright (c) 2003-2004, Artem B. Bityuckiy
3 * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 #include "cesbi.h"
27
28 #if defined (ICONV_TO_UCS_CES_UTF_16) \
29 || defined (ICONV_FROM_UCS_CES_UTF_16)
30
31 #include <sys/types.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "../lib/local.h"
36 #include "../lib/ucsconv.h"
37 #include "../lib/endian.h"
38
/*
 * On input, the UTF-16 converter interprets a leading BOM and assumes Big
 * Endian byte order if the BOM is absent. On output, the UTF-16 converter
 * writes in System Endian byte order and emits the corresponding BOM as the
 * first code unit. The UTF-16LE and UTF-16BE converters do not interpret a
 * BOM on input (U+FEFF is passed through as an ordinary character) and do
 * not output a BOM.
 */
45
/*
 * Converter state values.  The byte-order values are stored in the int
 * allocated by the init functions; UTF16_BOM_WRITTEN is OR-ed into a
 * UTF16_SYSTEM_ENDIAN state once the BOM has been emitted.
 */
#define UTF16_UNDEFINED      0          /* byte order not determined yet */
#define UTF16_BIG_ENDIAN     (1 << 0)
#define UTF16_LITTLE_ENDIAN  (1 << 1)
#define UTF16_SYSTEM_ENDIAN  (1 << 2)
#define UTF16_BOM_WRITTEN    (1 << 3)

/* Byte order mark, as a host-order 16-bit value. */
#define UTF16_BOM            0xFEFF

/* Encoding names the init functions compare against. */
#define UTF_16    "utf_16"
#define UTF_16BE  "utf_16be"
#define UTF_16LE  "utf_16le"
57
/*
 * Release the converter state.
 *
 * data - pointer previously returned by one of the init functions
 *        (a malloc'ed block); passed straight to free().
 *
 * Always returns 0.
 */
static size_t
utf_16_close (void *data)
{
  size_t status = 0;

  free (data);

  return status;
}
65
66 #if defined (ICONV_FROM_UCS_CES_UTF_16)
67 static void *
utf_16_init_from_ucs(const char * encoding)68 utf_16_init_from_ucs (
69 const char *encoding)
70 {
71 int *data;
72
73 if ((data = (int *)malloc (sizeof (int))) == NULL)
74 return (void *)NULL;
75
76 if (strcmp (encoding, UTF_16LE) == 0)
77 *data = UTF16_LITTLE_ENDIAN;
78 else if (strcmp (encoding, UTF_16BE) == 0)
79 *data = UTF16_BIG_ENDIAN;
80 else
81 *data = UTF16_SYSTEM_ENDIAN;
82
83 return (void *)data;
84 }
85
86 static size_t
utf_16_convert_from_ucs(void * data,register ucs4_t in,unsigned char ** outbuf,size_t * outbytesleft)87 utf_16_convert_from_ucs (void *data,
88 register ucs4_t in,
89 unsigned char **outbuf,
90 size_t *outbytesleft)
91 {
92 register ucs2_t *cp;
93 register size_t bytes;
94 register int *state;
95
96 if (in > 0x0010FFFF || (in >= 0x0000D800 && in <= 0x0000DFFF)
97 || in == 0x0000FFFF || in == 0x0000FFFE)
98 return (size_t)ICONV_CES_INVALID_CHARACTER;
99
100 state = (int *)data;
101 bytes = (*state == UTF16_SYSTEM_ENDIAN) ? sizeof (ucs2_t) * 2
102 : sizeof (ucs2_t);
103
104 if (in > 0x0000FFFF)
105 bytes += sizeof (ucs2_t);
106
107 if (*outbytesleft < bytes)
108 return (size_t)ICONV_CES_NOSPACE;
109
110 cp = (ucs2_t *)*outbuf;
111
112 if (*state == UTF16_SYSTEM_ENDIAN)
113 {
114 *cp++ = UTF16_BOM;
115 *state |= UTF16_BOM_WRITTEN;
116 }
117
118 if (in < 0x00010000)
119 {
120 switch (*state)
121 {
122 case UTF16_LITTLE_ENDIAN:
123 *cp = ICONV_HTOLES ((ucs2_t)in);
124 break;
125 case UTF16_BIG_ENDIAN:
126 *cp = ICONV_HTOBES ((ucs2_t)in);
127 break;
128 case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
129 *cp = (ucs2_t)in;
130 break;
131 }
132 }
133 else
134 {
135 ucs2_t w1, w2;
136
137 /* Process surrogate pair */
138 in -= 0x00010000;
139 w1 = ((ucs2_t)((in >> 10)) & 0x03FF) | 0xD800;
140 w2 = (ucs2_t)(in & 0x000003FF) | 0xDC00;
141
142 switch (*state)
143 {
144 case UTF16_LITTLE_ENDIAN:
145 *cp++ = ICONV_HTOLES (w1);
146 *cp = ICONV_HTOLES (w2);
147 break;
148 case UTF16_BIG_ENDIAN:
149 *cp++ = ICONV_HTOBES (w1);
150 *cp = ICONV_HTOBES (w2);
151 break;
152 case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
153 *cp++ = w1;
154 *cp = w2;
155 break;
156 }
157 }
158
159 *outbuf += bytes;
160 *outbytesleft -= bytes;
161
162 return bytes;
163 }
164 #endif /* ICONV_FROM_UCS_CES_UTF_16 */
165
166 #if defined (ICONV_TO_UCS_CES_UTF_16)
167 static void *
utf_16_init_to_ucs(const char * encoding)168 utf_16_init_to_ucs (
169 const char *encoding)
170 {
171 int *data;
172
173 if ((data = (int *)malloc (sizeof (int))) == NULL)
174 return (void *)NULL;
175
176 if (strcmp (encoding, UTF_16BE) == 0)
177 *data = UTF16_BIG_ENDIAN;
178 else if (strcmp (encoding, UTF_16LE) == 0)
179 *data = UTF16_LITTLE_ENDIAN;
180 else
181 *data = UTF16_UNDEFINED;
182
183 return (void *)data;
184 }
185
186 static ucs4_t
utf_16_convert_to_ucs(void * data,const unsigned char ** inbuf,size_t * inbytesleft)187 utf_16_convert_to_ucs (void *data,
188 const unsigned char **inbuf,
189 size_t *inbytesleft)
190 {
191 register ucs2_t w1;
192 register ucs2_t w2;
193 register ucs2_t *cp;
194 int *state;
195 ucs4_t res;
196 size_t bytes = sizeof (ucs2_t);
197
198 (void) data;
199 if (*inbytesleft < bytes)
200 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
201
202 state = (int *)data;
203 cp = ((ucs2_t *)*inbuf);
204
205 if (*state == UTF16_UNDEFINED)
206 {
207 if (*cp == ICONV_HTOLES(UTF16_BOM))
208 *state = UTF16_LITTLE_ENDIAN;
209 else
210 *state = UTF16_BIG_ENDIAN;
211
212 if ( *cp == ICONV_HTOBES (UTF16_BOM)
213 || *cp == ICONV_HTOLES (UTF16_BOM))
214 {
215 if (*inbytesleft < (bytes += sizeof (ucs2_t)))
216 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
217 cp += 1;
218 }
219 }
220
221 if (*state == UTF16_LITTLE_ENDIAN)
222 w1 = ICONV_LETOHS (*cp);
223 else
224 w1 = ICONV_BETOHS (*cp);
225
226 if (w1 < 0xD800 || w1 > 0xDFFF)
227 {
228 if (w1 == 0xFFFF || w1 == 0xFFFE)
229 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
230 res = (ucs4_t)w1;
231 }
232 else
233 {
234 /* Process surrogate pair */
235 if (*inbytesleft < (bytes += 2))
236 return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
237
238 if (w1 > 0xDBFF)
239 /* Broken surrogate character */
240 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
241
242 cp += 1;
243
244 if (*state == UTF16_LITTLE_ENDIAN)
245 w2 = ICONV_LETOHS (*cp);
246 else
247 w2 = ICONV_BETOHS (*cp);
248
249 if (w2 < 0xDC00 || w2 > 0xDFFF)
250 /* Broken surrogate character */
251 return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
252
253 res = (ucs4_t)(w2 & 0x03FF) | ((ucs4_t)(w1 & 0x03FF) << 10);
254 res += 0x00010000;
255 }
256
257 *inbuf += bytes;
258 *inbytesleft -= bytes;
259
260 return res;
261 }
262 #endif /* ICONV_TO_UCS_CES_UTF_16 */
263
/*
 * Report the maximum number of bytes one character may occupy.
 *
 * data - converter state; not used.
 *
 * Always returns 6, which equals the largest output of the from-UCS
 * converter in this file: a BOM followed by a surrogate pair, three
 * 16-bit units in total.
 */
static int
utf_16_get_mb_cur_max (void *data)
{
  enum { UTF16_MAX_BYTES_PER_CHAR = 6 };

  (void) data;

  return UTF16_MAX_BYTES_PER_CHAR;
}
270
271 #if defined (ICONV_TO_UCS_CES_UTF_16)
272 const iconv_to_ucs_ces_handlers_t
273 _iconv_to_ucs_ces_handlers_utf_16 =
274 {
275 utf_16_init_to_ucs,
276 utf_16_close,
277 utf_16_get_mb_cur_max,
278 NULL,
279 NULL,
280 NULL,
281 utf_16_convert_to_ucs
282 };
283 #endif
284
285 #if defined (ICONV_FROM_UCS_CES_UTF_16)
286 const iconv_from_ucs_ces_handlers_t
287 _iconv_from_ucs_ces_handlers_utf_16 =
288 {
289 utf_16_init_from_ucs,
290 utf_16_close,
291 utf_16_get_mb_cur_max,
292 NULL,
293 NULL,
294 NULL,
295 utf_16_convert_from_ucs
296 };
297 #endif
298
299 #endif /* ICONV_TO_UCS_CES_UTF_16 || ICONV_FROM_UCS_CES_UTF_16 */
300
301