1 /*
2  * Copyright (c) 2003-2004, Artem B. Bityuckiy
3  * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 #include "cesbi.h"
27 
28 #if defined (ICONV_TO_UCS_CES_UTF_16) \
29  || defined (ICONV_FROM_UCS_CES_UTF_16)
30 
31 #include <sys/types.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "../lib/local.h"
36 #include "../lib/ucsconv.h"
37 #include "../lib/endian.h"
38 
/*
 * On input, the UTF-16 converter interprets the BOM and assumes Big Endian
 * byte order if the BOM is absent. On output, the UTF-16 converter writes in
 * System Endian byte order and emits the corresponding BOM as the first code
 * unit. The UTF-16LE and UTF-16BE converters do not interpret a BOM on input
 * and do not output a BOM.
 */
45 
46 #define UTF16_UNDEFINED     0x00
47 #define UTF16_BIG_ENDIAN    0x01
48 #define UTF16_LITTLE_ENDIAN 0x02
49 #define UTF16_SYSTEM_ENDIAN 0x04
50 #define UTF16_BOM_WRITTEN   0x08
51 
52 #define UTF16_BOM 0xFEFF
53 
54 #define UTF_16   "utf_16"
55 #define UTF_16BE "utf_16be"
56 #define UTF_16LE "utf_16le"
57 
58 static size_t
utf_16_close(void * data)59 utf_16_close (
60                      void *data)
61 {
62   free(data);
63   return 0;
64 }
65 
66 #if defined (ICONV_FROM_UCS_CES_UTF_16)
67 static void *
utf_16_init_from_ucs(const char * encoding)68 utf_16_init_from_ucs (
69                              const char *encoding)
70 {
71   int *data;
72 
73   if ((data = (int *)malloc (sizeof (int))) == NULL)
74     return (void *)NULL;
75 
76   if (strcmp (encoding, UTF_16LE) == 0)
77     *data = UTF16_LITTLE_ENDIAN;
78   else if (strcmp (encoding, UTF_16BE) == 0)
79     *data = UTF16_BIG_ENDIAN;
80   else
81     *data = UTF16_SYSTEM_ENDIAN;
82 
83   return (void *)data;
84 }
85 
86 static size_t
utf_16_convert_from_ucs(void * data,register ucs4_t in,unsigned char ** outbuf,size_t * outbytesleft)87 utf_16_convert_from_ucs (void *data,
88                                 register ucs4_t in,
89                                 unsigned char **outbuf,
90                                 size_t *outbytesleft)
91 {
92   register ucs2_t *cp;
93   register size_t bytes;
94   register int *state;
95 
96   if (in > 0x0010FFFF || (in >= 0x0000D800 && in <= 0x0000DFFF)
97       || in == 0x0000FFFF || in == 0x0000FFFE)
98     return (size_t)ICONV_CES_INVALID_CHARACTER;
99 
100   state = (int *)data;
101   bytes = (*state == UTF16_SYSTEM_ENDIAN) ? sizeof (ucs2_t) * 2
102                                           : sizeof (ucs2_t);
103 
104   if (in > 0x0000FFFF)
105     bytes += sizeof (ucs2_t);
106 
107   if (*outbytesleft < bytes)
108     return (size_t)ICONV_CES_NOSPACE;
109 
110   cp = (ucs2_t *)*outbuf;
111 
112   if (*state == UTF16_SYSTEM_ENDIAN)
113     {
114       *cp++ = UTF16_BOM;
115       *state |= UTF16_BOM_WRITTEN;
116     }
117 
118   if (in < 0x00010000)
119     {
120       switch (*state)
121         {
122           case UTF16_LITTLE_ENDIAN:
123             *cp = ICONV_HTOLES ((ucs2_t)in);
124             break;
125           case UTF16_BIG_ENDIAN:
126             *cp = ICONV_HTOBES ((ucs2_t)in);
127             break;
128           case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
129             *cp = (ucs2_t)in;
130             break;
131         }
132     }
133   else
134     {
135       ucs2_t w1, w2;
136 
137       /* Process surrogate pair */
138       in -= 0x00010000;
139       w1 = ((ucs2_t)((in >> 10)) & 0x03FF) | 0xD800;
140       w2 = (ucs2_t)(in & 0x000003FF) | 0xDC00;
141 
142       switch (*state)
143         {
144           case UTF16_LITTLE_ENDIAN:
145             *cp++ = ICONV_HTOLES (w1);
146             *cp = ICONV_HTOLES (w2);
147             break;
148           case UTF16_BIG_ENDIAN:
149             *cp++ = ICONV_HTOBES (w1);
150             *cp = ICONV_HTOBES (w2);
151             break;
152           case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
153             *cp++ = w1;
154             *cp = w2;
155             break;
156         }
157     }
158 
159   *outbuf += bytes;
160   *outbytesleft -= bytes;
161 
162   return bytes;
163 }
164 #endif /* ICONV_FROM_UCS_CES_UTF_16 */
165 
166 #if defined (ICONV_TO_UCS_CES_UTF_16)
167 static void *
utf_16_init_to_ucs(const char * encoding)168 utf_16_init_to_ucs (
169                            const char *encoding)
170 {
171   int *data;
172 
173   if ((data = (int *)malloc (sizeof (int))) == NULL)
174     return (void *)NULL;
175 
176   if (strcmp (encoding, UTF_16BE) == 0)
177     *data = UTF16_BIG_ENDIAN;
178   else if (strcmp (encoding, UTF_16LE) == 0)
179     *data = UTF16_LITTLE_ENDIAN;
180   else
181     *data = UTF16_UNDEFINED;
182 
183   return (void *)data;
184 }
185 
186 static ucs4_t
utf_16_convert_to_ucs(void * data,const unsigned char ** inbuf,size_t * inbytesleft)187 utf_16_convert_to_ucs (void *data,
188                               const unsigned char **inbuf,
189                               size_t *inbytesleft)
190 {
191   register ucs2_t w1;
192   register ucs2_t w2;
193   register ucs2_t *cp;
194   int *state;
195   ucs4_t res;
196   size_t bytes = sizeof (ucs2_t);
197 
198   (void) data;
199   if (*inbytesleft < bytes)
200     return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
201 
202   state = (int *)data;
203   cp = ((ucs2_t *)*inbuf);
204 
205   if (*state == UTF16_UNDEFINED)
206     {
207       if (*cp == ICONV_HTOLES(UTF16_BOM))
208         *state = UTF16_LITTLE_ENDIAN;
209       else
210         *state = UTF16_BIG_ENDIAN;
211 
212      if (   *cp == ICONV_HTOBES (UTF16_BOM)
213          || *cp == ICONV_HTOLES (UTF16_BOM))
214        {
215          if (*inbytesleft < (bytes += sizeof (ucs2_t)))
216            return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
217          cp += 1;
218        }
219     }
220 
221   if (*state == UTF16_LITTLE_ENDIAN)
222     w1 = ICONV_LETOHS (*cp);
223   else
224     w1 = ICONV_BETOHS (*cp);
225 
226   if (w1  < 0xD800 || w1 > 0xDFFF)
227     {
228       if (w1 == 0xFFFF || w1 == 0xFFFE)
229         return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
230       res = (ucs4_t)w1;
231     }
232   else
233     {
234       /* Process surrogate pair */
235       if (*inbytesleft < (bytes += 2))
236         return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
237 
238       if (w1 > 0xDBFF)
239         /* Broken surrogate character */
240         return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
241 
242       cp += 1;
243 
244       if (*state == UTF16_LITTLE_ENDIAN)
245         w2 = ICONV_LETOHS (*cp);
246       else
247         w2 = ICONV_BETOHS (*cp);
248 
249       if (w2 < 0xDC00 || w2 > 0xDFFF)
250         /* Broken surrogate character */
251         return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
252 
253       res = (ucs4_t)(w2 & 0x03FF) | ((ucs4_t)(w1 & 0x03FF) << 10);
254       res += 0x00010000;
255     }
256 
257   *inbuf += bytes;
258   *inbytesleft -= bytes;
259 
260   return res;
261 }
262 #endif /* ICONV_TO_UCS_CES_UTF_16 */
263 
264 static int
utf_16_get_mb_cur_max(void * data)265 utf_16_get_mb_cur_max (void *data)
266 {
267   (void) data;
268   return 6;
269 }
270 
271 #if defined (ICONV_TO_UCS_CES_UTF_16)
272 const iconv_to_ucs_ces_handlers_t
273 _iconv_to_ucs_ces_handlers_utf_16 =
274 {
275   utf_16_init_to_ucs,
276   utf_16_close,
277   utf_16_get_mb_cur_max,
278   NULL,
279   NULL,
280   NULL,
281   utf_16_convert_to_ucs
282 };
283 #endif
284 
285 #if defined (ICONV_FROM_UCS_CES_UTF_16)
286 const iconv_from_ucs_ces_handlers_t
287 _iconv_from_ucs_ces_handlers_utf_16 =
288 {
289   utf_16_init_from_ucs,
290   utf_16_close,
291   utf_16_get_mb_cur_max,
292   NULL,
293   NULL,
294   NULL,
295   utf_16_convert_from_ucs
296 };
297 #endif
298 
299 #endif /* ICONV_TO_UCS_CES_UTF_16 || ICONV_FROM_UCS_CES_UTF_16 */
300 
301