1 /*
2  * Copyright (c) 2003-2004, Artem B. Bityuckiy
3  * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 #include "cesbi.h"
27 
28 #if defined (ICONV_TO_UCS_CES_UTF_16) \
29  || defined (ICONV_FROM_UCS_CES_UTF_16)
30 
31 #include <_ansi.h>
32 #include <sys/types.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include "../lib/local.h"
37 #include "../lib/ucsconv.h"
38 #include "../lib/endian.h"
39 
40 /*
41  * On input, the UTF-16 converter interprets the BOM and assumes Big Endian byte
42  * order if the BOM is absent. On output, it uses System Endian byte order and
43  * emits the corresponding BOM as the first code. The UTF-16LE and UTF-16BE
44  * converters do not interpret a BOM on input and do not output a BOM.
45  */
46 
/*
 * Values stored in the int state block that the init functions allocate.
 * UTF16_UNDEFINED is the decoder's initial state when the encoding name is
 * neither UTF_16BE nor UTF_16LE.  UTF16_BOM_WRITTEN is OR-ed into
 * UTF16_SYSTEM_ENDIAN once the encoder has emitted the BOM.
 */
47 #define UTF16_UNDEFINED     0x00
48 #define UTF16_BIG_ENDIAN    0x01
49 #define UTF16_LITTLE_ENDIAN 0x02
50 #define UTF16_SYSTEM_ENDIAN 0x04
51 #define UTF16_BOM_WRITTEN   0x08
52 
/* Byte order mark as a host-order value. */
53 #define UTF16_BOM 0xFEFF
54 
/*
 * Encoding names compared against the name passed to the init functions.
 * UTF_16 itself is not referenced in the code visible in this file: any name
 * other than UTF_16BE / UTF_16LE falls through to the generic behaviour.
 */
55 #define UTF_16   "utf_16"
56 #define UTF_16BE "utf_16be"
57 #define UTF_16LE "utf_16le"
58 
/*
 * Close handler: frees the converter state block passed in 'data'.
 * Always returns 0.
 */
static size_t
utf_16_close (void *data)
{
  free (data);

  return (size_t)0;
}
66 
67 #if defined (ICONV_FROM_UCS_CES_UTF_16)
68 static void *
utf_16_init_from_ucs(const char * encoding)69 utf_16_init_from_ucs (
70                              const char *encoding)
71 {
72   int *data;
73 
74   if ((data = (int *)malloc (sizeof (int))) == NULL)
75     return (void *)NULL;
76 
77   if (strcmp (encoding, UTF_16LE) == 0)
78     *data = UTF16_LITTLE_ENDIAN;
79   else if (strcmp (encoding, UTF_16BE) == 0)
80     *data = UTF16_BIG_ENDIAN;
81   else
82     *data = UTF16_SYSTEM_ENDIAN;
83 
84   return (void *)data;
85 }
86 
87 static size_t
utf_16_convert_from_ucs(void * data,register ucs4_t in,unsigned char ** outbuf,size_t * outbytesleft)88 utf_16_convert_from_ucs (void *data,
89                                 register ucs4_t in,
90                                 unsigned char **outbuf,
91                                 size_t *outbytesleft)
92 {
93   register ucs2_t *cp;
94   register size_t bytes;
95   register int *state;
96 
97   if (in > 0x0010FFFF || (in >= 0x0000D800 && in <= 0x0000DFFF)
98       || in == 0x0000FFFF || in == 0x0000FFFE)
99     return (size_t)ICONV_CES_INVALID_CHARACTER;
100 
101   state = (int *)data;
102   bytes = (*state == UTF16_SYSTEM_ENDIAN) ? sizeof (ucs2_t) * 2
103                                           : sizeof (ucs2_t);
104 
105   if (in > 0x0000FFFF)
106     bytes += sizeof (ucs2_t);
107 
108   if (*outbytesleft < bytes)
109     return (size_t)ICONV_CES_NOSPACE;
110 
111   cp = (ucs2_t *)*outbuf;
112 
113   if (*state == UTF16_SYSTEM_ENDIAN)
114     {
115       *cp++ = UTF16_BOM;
116       *state |= UTF16_BOM_WRITTEN;
117     }
118 
119   if (in < 0x00010000)
120     {
121       switch (*state)
122         {
123           case UTF16_LITTLE_ENDIAN:
124             *cp = ICONV_HTOLES ((ucs2_t)in);
125             break;
126           case UTF16_BIG_ENDIAN:
127             *cp = ICONV_HTOBES ((ucs2_t)in);
128             break;
129           case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
130             *cp = (ucs2_t)in;
131             break;
132         }
133     }
134   else
135     {
136       ucs2_t w1, w2;
137 
138       /* Process surrogate pair */
139       in -= 0x00010000;
140       w1 = ((ucs2_t)((in >> 10)) & 0x03FF) | 0xD800;
141       w2 = (ucs2_t)(in & 0x000003FF) | 0xDC00;
142 
143       switch (*state)
144         {
145           case UTF16_LITTLE_ENDIAN:
146             *cp++ = ICONV_HTOLES (w1);
147             *cp = ICONV_HTOLES (w2);
148             break;
149           case UTF16_BIG_ENDIAN:
150             *cp++ = ICONV_HTOBES (w1);
151             *cp = ICONV_HTOBES (w2);
152             break;
153           case (UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN):
154             *cp++ = w1;
155             *cp = w2;
156             break;
157         }
158     }
159 
160   *outbuf += bytes;
161   *outbytesleft -= bytes;
162 
163   return bytes;
164 }
165 #endif /* ICONV_FROM_UCS_CES_UTF_16 */
166 
167 #if defined (ICONV_TO_UCS_CES_UTF_16)
168 static void *
utf_16_init_to_ucs(const char * encoding)169 utf_16_init_to_ucs (
170                            const char *encoding)
171 {
172   int *data;
173 
174   if ((data = (int *)malloc (sizeof (int))) == NULL)
175     return (void *)NULL;
176 
177   if (strcmp (encoding, UTF_16BE) == 0)
178     *data = UTF16_BIG_ENDIAN;
179   else if (strcmp (encoding, UTF_16LE) == 0)
180     *data = UTF16_LITTLE_ENDIAN;
181   else
182     *data = UTF16_UNDEFINED;
183 
184   return (void *)data;
185 }
186 
187 static ucs4_t
utf_16_convert_to_ucs(void * data,const unsigned char ** inbuf,size_t * inbytesleft)188 utf_16_convert_to_ucs (void *data,
189                               const unsigned char **inbuf,
190                               size_t *inbytesleft)
191 {
192   register ucs2_t w1;
193   register ucs2_t w2;
194   register ucs2_t *cp;
195   int *state;
196   ucs4_t res;
197   size_t bytes = sizeof (ucs2_t);
198 
199   (void) data;
200   if (*inbytesleft < bytes)
201     return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
202 
203   state = (int *)data;
204   cp = ((ucs2_t *)*inbuf);
205 
206   if (*state == UTF16_UNDEFINED)
207     {
208       if (*cp == ICONV_HTOLES(UTF16_BOM))
209         *state = UTF16_LITTLE_ENDIAN;
210       else
211         *state = UTF16_BIG_ENDIAN;
212 
213      if (   *cp == ICONV_HTOBES (UTF16_BOM)
214          || *cp == ICONV_HTOLES (UTF16_BOM))
215        {
216          if (*inbytesleft < (bytes += sizeof (ucs2_t)))
217            return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
218          cp += 1;
219        }
220     }
221 
222   if (*state == UTF16_LITTLE_ENDIAN)
223     w1 = ICONV_LETOHS (*cp);
224   else
225     w1 = ICONV_BETOHS (*cp);
226 
227   if (w1  < 0xD800 || w1 > 0xDFFF)
228     {
229       if (w1 == 0xFFFF || w1 == 0xFFFE)
230         return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
231       res = (ucs4_t)w1;
232     }
233   else
234     {
235       /* Process surrogate pair */
236       if (*inbytesleft < (bytes += 2))
237         return (ucs4_t)ICONV_CES_BAD_SEQUENCE;
238 
239       if (w1 > 0xDBFF)
240         /* Broken surrogate character */
241         return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
242 
243       cp += 1;
244 
245       if (*state == UTF16_LITTLE_ENDIAN)
246         w2 = ICONV_LETOHS (*cp);
247       else
248         w2 = ICONV_BETOHS (*cp);
249 
250       if (w2 < 0xDC00 || w2 > 0xDFFF)
251         /* Broken surrogate character */
252         return (ucs4_t)ICONV_CES_INVALID_CHARACTER;
253 
254       res = (ucs4_t)(w2 & 0x03FF) | ((ucs4_t)(w1 & 0x03FF) << 10);
255       res += 0x00010000;
256     }
257 
258   *inbuf += bytes;
259   *inbytesleft -= bytes;
260 
261   return res;
262 }
263 #endif /* ICONV_TO_UCS_CES_UTF_16 */
264 
/*
 * Report the maximum number of bytes a single character can occupy.
 * The argument is not used.  The value 6 presumably covers a BOM plus a
 * surrogate pair (three 2-byte units) -- this assumes ucs2_t is 2 bytes.
 */
static int
utf_16_get_mb_cur_max (void *data)
{
  enum { UTF16_MB_CUR_MAX = 6 };

  (void) data;

  return UTF16_MB_CUR_MAX;
}
271 
272 #if defined (ICONV_TO_UCS_CES_UTF_16)
/*
 * Handler table for the UTF-16 -> UCS direction.  The initializers are
 * positional; the three NULL entries are slots this converter leaves unset.
 * NOTE(review): the meaning of each slot is fixed by
 * iconv_to_ucs_ces_handlers_t, which is declared outside this file --
 * confirm the order there before changing anything.
 */
273 const iconv_to_ucs_ces_handlers_t
274 _iconv_to_ucs_ces_handlers_utf_16 =
275 {
276   utf_16_init_to_ucs,      /* allocates the decoder state */
277   utf_16_close,            /* frees the state */
278   utf_16_get_mb_cur_max,   /* always returns 6 */
279   NULL,
280   NULL,
281   NULL,
282   utf_16_convert_to_ucs    /* decodes one character */
283 };
284 #endif
285 
286 #if defined (ICONV_FROM_UCS_CES_UTF_16)
/*
 * Handler table for the UCS -> UTF-16 direction.  The initializers are
 * positional; the three NULL entries are slots this converter leaves unset.
 * NOTE(review): the meaning of each slot is fixed by
 * iconv_from_ucs_ces_handlers_t, which is declared outside this file --
 * confirm the order there before changing anything.
 */
287 const iconv_from_ucs_ces_handlers_t
288 _iconv_from_ucs_ces_handlers_utf_16 =
289 {
290   utf_16_init_from_ucs,    /* allocates the encoder state */
291   utf_16_close,            /* frees the state */
292   utf_16_get_mb_cur_max,   /* always returns 6 */
293   NULL,
294   NULL,
295   NULL,
296   utf_16_convert_from_ucs  /* encodes one character */
297 };
298 #endif
299 
300 #endif /* ICONV_TO_UCS_CES_UTF_16 || ICONV_FROM_UCS_CES_UTF_16 */
301 
302