1 /*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net>
13 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14 Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15 Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16 Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com>
17 Copyright (c) 2016 Don Lewis <truckman@apache.org>
18 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
19 Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net>
20 Copyright (c) 2017 Benbuck Nason <bnason@netflix.com>
21 Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com>
22 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
23 Copyright (c) 2021 Donghee Na <donghee.na@python.org>
24 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com>
25 Copyright (c) 2022 Sean McBride <sean@rogue-research.com>
26 Copyright (c) 2023 Hanno Böck <hanno@gentoo.org>
27 Licensed under the MIT license:
28
29 Permission is hereby granted, free of charge, to any person obtaining
30 a copy of this software and associated documentation files (the
31 "Software"), to deal in the Software without restriction, including
32 without limitation the rights to use, copy, modify, merge, publish,
33 distribute, sublicense, and/or sell copies of the Software, and to permit
34 persons to whom the Software is furnished to do so, subject to the
35 following conditions:
36
37 The above copyright notice and this permission notice shall be included
38 in all copies or substantial portions of the Software.
39
40 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
41 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
43 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
44 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
45 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
46 USE OR OTHER DEALINGS IN THE SOFTWARE.
47 */
48
49 #include "../../lv_conf_internal.h"
50 #if LV_USE_XML
51
52 #include "expat_config.h"
53
54 #include <stddef.h>
55 #include <string.h> /* memcpy */
56 #include <stdbool.h>
57
58 #ifdef _WIN32
59 # include "winconfig.h"
60 #endif
61
62 #include "expat_external.h"
63 #include "internal.h"
64 #include "xmltok.h"
65 #include "nametab.h"
66
/* Optional extra entry for the first brace group of VTABLE1: the
   ignore-section tokenizer is only listed when XML_DTD is defined. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Leading part of the positional initializer for the ENCODING member of
   struct normal_encoding.  Every entry goes through PREFIX(), so the
   functions named here are those of whichever instantiation PREFIX
   currently selects. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 plus the two converters of the current PREFIX instantiation. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
82
/* Test one bit of namingBitmap for the 16-bit value whose high byte is
   `hi` and low byte is `lo`: pages[hi] selects a group of 8 words, the top
   3 bits of `lo` select the word and the bottom 5 bits select the bit.
   Evaluates to non-zero when the bit is set. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
106
107 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
108 of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
109 with the additional restriction of not allowing the Unicode
110 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
111 Implementation details:
112 (A & 0x80) == 0 means A < 0x80
113 and
114 (A & 0xC0) == 0xC0 means A > 0xBF
115 */
116
/* Non-zero when the 2-byte sequence at p is rejected: lead byte below 0xC2,
   or second byte outside 0x80..0xBF.  p is expected to point to unsigned
   char (the callers in this file cast before passing it). */
#define UTF8_INVALID2(p)                                                       \
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* Non-zero when the 3-byte sequence at p is rejected:
   - third byte must be in 0x80..0xBF, and for EF,BF,xx it must not exceed
     0xBD (this excludes EF,BF,BE and EF,BF,BF);
   - second byte must be in 0xA0..0xBF after lead 0xE0, in 0x80..0x9F after
     lead 0xED, and in 0x80..0xBF otherwise. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
                                      : ((p)[2] & 0xC0) == 0xC0)               \
   || ((*p) == 0xE0                                                            \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

/* Non-zero when the 4-byte sequence at p is rejected:
   - third and fourth bytes must be in 0x80..0xBF;
   - second byte must be in 0x90..0xBF after lead 0xF0, in 0x80..0x8F after
     lead 0xF4, and in 0x80..0xBF otherwise. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*p) == 0xF0                                                            \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
136
137 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)138 isNever(const ENCODING *enc, const char *p) {
139 UNUSED_P(enc);
140 UNUSED_P(p);
141 return 0;
142 }
143
144 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)145 utf8_isName2(const ENCODING *enc, const char *p) {
146 UNUSED_P(enc);
147 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
148 }
149
150 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)151 utf8_isName3(const ENCODING *enc, const char *p) {
152 UNUSED_P(enc);
153 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
154 }
155
/* 4-byte sequences are never accepted as name characters: the callback is
   an alias for isNever, which always returns 0. */
#define utf8_isName4 isNever
157
158 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)159 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
160 UNUSED_P(enc);
161 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
162 }
163
164 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)165 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
166 UNUSED_P(enc);
167 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
168 }
169
/* 4-byte sequences are never accepted as name-start characters: the
   callback is an alias for isNever, which always returns 0. */
#define utf8_isNmstrt4 isNever
171
172 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)173 utf8_isInvalid2(const ENCODING *enc, const char *p) {
174 UNUSED_P(enc);
175 return UTF8_INVALID2((const unsigned char *)p);
176 }
177
178 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)179 utf8_isInvalid3(const ENCODING *enc, const char *p) {
180 UNUSED_P(enc);
181 return UTF8_INVALID3((const unsigned char *)p);
182 }
183
184 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)185 utf8_isInvalid4(const ENCODING *enc, const char *p) {
186 UNUSED_P(enc);
187 return UTF8_INVALID4((const unsigned char *)p);
188 }
189
/* An ENCODING extended with a byte classification table and per-encoding
   callbacks.  Instances in this file are built with positional
   initializers (VTABLE..., table includes, STANDARD_VTABLE, NORMAL_VTABLE
   or NULL_VTABLE), so the member order here must match those macros. */
struct normal_encoding {
  /* Kept first: AS_NORMAL_ENCODING() and SB_BYTE_TYPE() cast an ENCODING
     pointer straight to a struct normal_encoding pointer. */
  ENCODING enc;
  /* Byte type (BT_* code) for each possible byte value; indexed with
     (unsigned char)*p by SB_BYTE_TYPE(). */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Extra indirections present only in XML_MIN_SIZE builds; filled in by
     STANDARD_VTABLE(). */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name / name-start / invalid-sequence checks for 2-, 3- and 4-byte
     sequences; filled in by NORMAL_VTABLE() or NULL_VTABLE. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
210
/* View an ENCODING pointer as the struct normal_encoding that contains it. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* Initializers for the XML_MIN_SIZE-only members of struct normal_encoding,
   using the functions whose names start with prefix E.  Note the trailing
   comma: the macro is written directly in front of NORMAL_VTABLE/NULL_VTABLE.
   Expands to nothing when XML_MIN_SIZE is not defined. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte callbacks, taken from the functions
   whose names start with prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all set to NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
232
233 static int FASTCALL checkCharRefNumber(int result);
234
235 #include "xmltok_impl.h"
236 #include "ascii.h"
237
/* Parameter macros for the "normal_" instantiation of xmltok_impl.c that
   follows.  In XML_MIN_SIZE builds each operation is routed through a
   function pointer stored in struct normal_encoding; otherwise it is
   expanded inline. */

/* The single-byte "Min" name checks never match. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, read from the encoding's type[] table. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, for the byteType slot. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
/* For single-byte encodings the byte itself is returned. */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif

/* n is the sequence length (2, 3 or 4) and selects the callback slot. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#ifdef XML_MIN_SIZE
/* The isInvalid slot is checked for NULL before being called here. */
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* Single-byte comparison of the byte at p against c. */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
307
/* Instantiate the tokenizer template with the "normal_" name prefix, using
   the parameter macros defined above, then remove those macros again so
   that later sections can define their own. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence, i.e. the marker bits that identify the sequence length. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
329
/* Move *fromLimRef backwards so that the range [from, *fromLimRef) does not
 * end in the middle of a multi-byte UTF-8 character.
 *
 * The range is scanned from its end towards `from`:
 *  - a 1-byte character (0xxxxxxx) ends the scan with the limit just after it;
 *  - a lead byte of an N-byte character ends the scan with the limit just
 *    after that character when at least N bytes (lead included) have been
 *    stepped over, otherwise the lead byte is dropped and the scan goes on;
 *  - any other byte is stepped over.
 *
 * from        start of the buffer (lower bound for the scan)
 * fromLimRef  in: current end of the buffer; out: trimmed end
 */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t stepped = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen;

    if ((last & 0x80u) == 0x00u)
      break; /* 1-byte character, matching 0b0xxxxxxx */

    if ((last & 0xf8u) == 0xf0u)
      seqLen = 4; /* lead byte 0b11110xxx */
    else if ((last & 0xf0u) == 0xe0u)
      seqLen = 3; /* lead byte 0b1110xxxx */
    else if ((last & 0xe0u) == 0xc0u)
      seqLen = 2; /* lead byte 0b110xxxxx */
    else
      seqLen = 0; /* not a lead byte */

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* `end - 1` is the lead byte; put the limit right after the
           complete character. */
        end += seqLen - 1;
        break;
      }
      stepped = 0; /* incomplete character: drop it and keep scanning */
    }

    end--;
    stepped++;
  }

  *fromLimRef = end;
}
368
369 static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)370 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
371 char **toP, const char *toLim) {
372 bool input_incomplete = false;
373 bool output_exhausted = false;
374
375 /* Avoid copying partial characters (due to limited space). */
376 const ptrdiff_t bytesAvailable = fromLim - *fromP;
377 const ptrdiff_t bytesStorable = toLim - *toP;
378 UNUSED_P(enc);
379 if (bytesAvailable > bytesStorable) {
380 fromLim = *fromP + bytesStorable;
381 output_exhausted = true;
382 }
383
384 /* Avoid copying partial characters (from incomplete input). */
385 {
386 const char *const fromLimBefore = fromLim;
387 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
388 if (fromLim < fromLimBefore) {
389 input_incomplete = true;
390 }
391 }
392
393 {
394 const ptrdiff_t bytesToCopy = fromLim - *fromP;
395 memcpy(*toP, *fromP, bytesToCopy);
396 *fromP += bytesToCopy;
397 *toP += bytesToCopy;
398 }
399
400 if (output_exhausted) /* needs to go first */
401 return XML_CONVERT_OUTPUT_EXHAUSTED;
402 else if (input_incomplete)
403 return XML_CONVERT_INPUT_INCOMPLETE;
404 else
405 return XML_CONVERT_COMPLETED;
406 }
407
408 static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)409 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
410 unsigned short **toP, const unsigned short *toLim) {
411 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
412 unsigned short *to = *toP;
413 const char *from = *fromP;
414 while (from < fromLim && to < toLim) {
415 switch (SB_BYTE_TYPE(enc, from)) {
416 case BT_LEAD2:
417 if (fromLim - from < 2) {
418 res = XML_CONVERT_INPUT_INCOMPLETE;
419 goto after;
420 }
421 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
422 from += 2;
423 break;
424 case BT_LEAD3:
425 if (fromLim - from < 3) {
426 res = XML_CONVERT_INPUT_INCOMPLETE;
427 goto after;
428 }
429 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
430 | (from[2] & 0x3f));
431 from += 3;
432 break;
433 case BT_LEAD4: {
434 unsigned long n;
435 if (toLim - to < 2) {
436 res = XML_CONVERT_OUTPUT_EXHAUSTED;
437 goto after;
438 }
439 if (fromLim - from < 4) {
440 res = XML_CONVERT_INPUT_INCOMPLETE;
441 goto after;
442 }
443 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
444 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
445 n -= 0x10000;
446 to[0] = (unsigned short)((n >> 10) | 0xD800);
447 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
448 to += 2;
449 from += 4;
450 } break;
451 default:
452 *to++ = *from++;
453 break;
454 }
455 }
456 if (from < fromLim)
457 res = XML_CONVERT_OUTPUT_EXHAUSTED;
458 after:
459 *fromP = from;
460 *toP = to;
461 return res;
462 }
463
/* UTF-8 encoding objects.  Each one pairs the normal_ tokenizer functions
   (VTABLE1) and the UTF-8 converters with a 256-entry byte type table that
   is assembled from two included table fragments.
   NOTE(review): the three integers after the converters presumably
   initialize minBytesPerChar and two flag members of ENCODING — confirm
   against the ENCODING declaration in xmltok.h.
   In the variants without the _ns suffix BT_COLON is defined as BT_NMSTRT
   while the ASCII table fragment is included. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

/* Same as above but with the ASCII half taken from iasciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
505
506 static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)507 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
508 char **toP, const char *toLim) {
509 UNUSED_P(enc);
510 for (;;) {
511 unsigned char c;
512 if (*fromP == fromLim)
513 return XML_CONVERT_COMPLETED;
514 c = (unsigned char)**fromP;
515 if (c & 0x80) {
516 if (toLim - *toP < 2)
517 return XML_CONVERT_OUTPUT_EXHAUSTED;
518 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
519 *(*toP)++ = (char)((c & 0x3f) | 0x80);
520 (*fromP)++;
521 } else {
522 if (*toP == toLim)
523 return XML_CONVERT_OUTPUT_EXHAUSTED;
524 *(*toP)++ = *(*fromP)++;
525 }
526 }
527 }
528
529 static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)530 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
531 unsigned short **toP, const unsigned short *toLim) {
532 UNUSED_P(enc);
533 while (*fromP < fromLim && *toP < toLim)
534 *(*toP)++ = (unsigned char)*(*fromP)++;
535
536 if ((*toP == toLim) && (*fromP < fromLim))
537 return XML_CONVERT_OUTPUT_EXHAUSTED;
538 else
539 return XML_CONVERT_COMPLETED;
540 }
541
/* Latin-1 encoding objects: normal_ tokenizer functions, the Latin-1
   converters, and a byte type table built from asciitab.h followed by
   latin1tab.h.  The multi-byte callback slots are all NULL (NULL_VTABLE).
   In the variant without the _ns suffix BT_COLON is defined as BT_NMSTRT
   while asciitab.h is included. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
563
564 static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)565 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
566 char **toP, const char *toLim) {
567 UNUSED_P(enc);
568 while (*fromP < fromLim && *toP < toLim)
569 *(*toP)++ = *(*fromP)++;
570
571 if ((*toP == toLim) && (*fromP < fromLim))
572 return XML_CONVERT_OUTPUT_EXHAUSTED;
573 else
574 return XML_CONVERT_COMPLETED;
575 }
576
/* ASCII encoding objects: normal_ tokenizer functions, ascii_toUtf8 and
   latin1_toUtf16 as converters, and a byte type table in which only the
   entries from asciitab.h are given explicitly; the remaining entries are
   left zero-initialized, which is BT_NONXML per the note in the table. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
598
599 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)600 unicode_byte_type(char hi, char lo) {
601 switch ((unsigned char)hi) {
602 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
603 case 0xD8:
604 case 0xD9:
605 case 0xDA:
606 case 0xDB:
607 return BT_LEAD4;
608 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
609 case 0xDC:
610 case 0xDD:
611 case 0xDE:
612 case 0xDF:
613 return BT_TRAIL;
614 case 0xFF:
615 switch ((unsigned char)lo) {
616 case 0xFF: /* noncharacter-FFFF */
617 case 0xFE: /* noncharacter-FFFE */
618 return BT_NONXML;
619 }
620 break;
621 }
622 return BT_NONASCII;
623 }
624
/* Define the function E##toUtf8, which converts UTF-16 input to UTF-8.
   GET_LO/GET_HI must be defined at the point of expansion and select the
   low and high byte of each 2-byte unit, which fixes the byte order.

   The generated function first shrinks the input to an even number of
   bytes, then handles one code unit per iteration, switching on its high
   byte:
     - 0x00 with low byte < 0x80      -> 1 output byte
     - 0x00 (otherwise) and 0x01-0x07 -> 2 output bytes
     - 0xD8-0xDB (first half of a surrogate pair, consumed together with
       the following unit)            -> 4 output bytes
     - anything else (default)        -> 3 output bytes
   Before each write the remaining output space is checked; when it is too
   small *fromP is set to the current unit and
   XML_CONVERT_OUTPUT_EXHAUSTED is returned.  A surrogate first half
   without a following unit yields XML_CONVERT_INPUT_INCOMPLETE.
   *toP is advanced as bytes are written. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
701
/* Define the function E##toUtf16, which copies UTF-16 input (byte order
   given by GET_LO/GET_HI at the point of expansion) into an array of
   unsigned short code units.

   The input is first shrunk to an even number of bytes.  When there is
   more input than output space and the last input unit is the first half
   of a surrogate pair, that unit is excluded and the provisional result
   becomes XML_CONVERT_INPUT_INCOMPLETE.  Units are then copied while both
   input and output space remain; *fromP and *toP are advanced as the copy
   proceeds.  Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output filled
   up with input still remaining, otherwise the provisional result. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
722
/* Little-endian converters: the low byte of each unit comes first. */
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef GET_LO
#undef GET_HI

/* Big-endian converters: the high byte of each unit comes first. */
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef GET_LO
#undef GET_HI
740
/* Accessors for little-endian 2-byte units: p[0] is the low byte and p[1]
   the high byte.  When the high byte is zero the unit is classified through
   the single-byte type table; otherwise unicode_byte_type() decides. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
/* Low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: no second copy of the tokenizer is generated.  The
   accessors are wrapped in functions for the STANDARD_VTABLE(little2_)
   slots, and VTABLE is redefined to use the little2_ converters. */

static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: instantiate xmltok_impl.c again with the "little2_"
   prefix and 2-byte accessors.  The multi-byte name checks are constant 0
   here; the *_MINBPC variants do the lookup instead. */

#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
813
/* Little-endian UTF-16 encoding objects.  The byte type table (ASCII
   fragment plus latin1tab.h) is what SB_BYTE_TYPE consults for units whose
   high byte is zero.  The last ENCODING initializer is 1 only when
   BYTEORDER == 1234.
   NOTE(review): presumably that member flags input already in the host's
   UTF-16 byte order — confirm against the ENCODING declaration. */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

/* The internal_ variants are compiled unless BYTEORDER is 4321; they use
   iasciitab.h for the ASCII half and always set the last initializer to 1. */
#if BYTEORDER != 4321

#  ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
873
/* Accessors for big-endian 2-byte units: p[0] is the high byte and p[1]
   the low byte.  When the high byte is zero the unit is classified through
   the single-byte type table (using the low byte at p + 1); otherwise
   unicode_byte_type() decides. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, p + 1) : unicode_byte_type((p)[0], (p)[1]))
/* Low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: no further copy of the tokenizer is generated.  The
   accessors are wrapped in functions for the STANDARD_VTABLE(big2_) slots,
   and VTABLE is redefined to use the big2_ converters. */

static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: instantiate xmltok_impl.c again with the "big2_" prefix
   and 2-byte accessors.  The multi-byte name checks are constant 0 here;
   the *_MINBPC variants do the lookup instead. */

#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
946
/* Big-endian UTF-16 encoding objects.  The byte type table (ASCII fragment
   plus latin1tab.h) is what SB_BYTE_TYPE consults for units whose high
   byte is zero.  The last ENCODING initializer is 1 only when
   BYTEORDER == 4321.
   NOTE(review): presumably that member flags input already in the host's
   UTF-16 byte order — confirm against the ENCODING declaration. */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

/* The internal_ variants are compiled unless BYTEORDER is 1234; they use
   iasciitab.h for the ASCII half and always set the last initializer to 1. */
#if BYTEORDER != 1234

#  ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
1006
1007 #undef PREFIX
1008
1009 static int FASTCALL
streqci(const char * s1,const char * s2)1010 streqci(const char *s1, const char *s2) {
1011 for (;;) {
1012 char c1 = *s1++;
1013 char c2 = *s2++;
1014 if (ASCII_a <= c1 && c1 <= ASCII_z)
1015 c1 += ASCII_A - ASCII_a;
1016 if (ASCII_a <= c2 && c2 <= ASCII_z)
1017 /* The following line will never get executed. streqci() is
1018 * only called from two places, both of which guarantee to put
1019 * upper-case strings into s2.
1020 */
1021 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1022 if (c1 != c2)
1023 return 0;
1024 if (! c1)
1025 break;
1026 }
1027 return 1;
1028 }
1029
1030 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)1031 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1032 POSITION *pos) {
1033 UNUSED_P(enc);
1034 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1035 }
1036
1037 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)1038 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1039 char buf[1];
1040 char *p = buf;
1041 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1042 if (p == buf)
1043 return -1;
1044 else
1045 return buf[0];
1046 }
1047
1048 static int FASTCALL
isSpace(int c)1049 isSpace(int c) {
1050 switch (c) {
1051 case 0x20:
1052 case 0xD:
1053 case 0xA:
1054 case 0x9:
1055 return 1;
1056 }
1057 return 0;
1058 }
1059
1060 /* Return 1 if there's just optional white space or there's an S
1061 followed by name=val.
1062 */
1063 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1064 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1065 const char **namePtr, const char **nameEndPtr,
1066 const char **valPtr, const char **nextTokPtr) {
1067 int c;
1068 char open;
1069 if (ptr == end) {
1070 *namePtr = NULL;
1071 return 1;
1072 }
1073 if (! isSpace(toAscii(enc, ptr, end))) {
1074 *nextTokPtr = ptr;
1075 return 0;
1076 }
1077 do {
1078 ptr += enc->minBytesPerChar;
1079 } while (isSpace(toAscii(enc, ptr, end)));
1080 if (ptr == end) {
1081 *namePtr = NULL;
1082 return 1;
1083 }
1084 *namePtr = ptr;
1085 for (;;) {
1086 c = toAscii(enc, ptr, end);
1087 if (c == -1) {
1088 *nextTokPtr = ptr;
1089 return 0;
1090 }
1091 if (c == ASCII_EQUALS) {
1092 *nameEndPtr = ptr;
1093 break;
1094 }
1095 if (isSpace(c)) {
1096 *nameEndPtr = ptr;
1097 do {
1098 ptr += enc->minBytesPerChar;
1099 } while (isSpace(c = toAscii(enc, ptr, end)));
1100 if (c != ASCII_EQUALS) {
1101 *nextTokPtr = ptr;
1102 return 0;
1103 }
1104 break;
1105 }
1106 ptr += enc->minBytesPerChar;
1107 }
1108 if (ptr == *namePtr) {
1109 *nextTokPtr = ptr;
1110 return 0;
1111 }
1112 ptr += enc->minBytesPerChar;
1113 c = toAscii(enc, ptr, end);
1114 while (isSpace(c)) {
1115 ptr += enc->minBytesPerChar;
1116 c = toAscii(enc, ptr, end);
1117 }
1118 if (c != ASCII_QUOT && c != ASCII_APOS) {
1119 *nextTokPtr = ptr;
1120 return 0;
1121 }
1122 open = (char)c;
1123 ptr += enc->minBytesPerChar;
1124 *valPtr = ptr;
1125 for (;; ptr += enc->minBytesPerChar) {
1126 c = toAscii(enc, ptr, end);
1127 if (c == open)
1128 break;
1129 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1130 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1131 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1132 *nextTokPtr = ptr;
1133 return 0;
1134 }
1135 }
1136 *nextTokPtr = ptr + enc->minBytesPerChar;
1137 return 1;
1138 }
1139
/* Pseudo-attribute names ("version", "encoding", "standalone") and
   standalone values ("yes", "no") that doParseXmlDecl matches with
   XmlNameMatchesAscii.  Each is spelled out from ASCII_* constants and
   explicitly NUL-terminated.
   NOTE(review): presumably spelled this way so the bytes do not depend
   on the compiler's execution character set -- confirm.
*/
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1153
1154 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1155 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1156 const char *),
1157 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1158 const char *end, const char **badPtr, const char **versionPtr,
1159 const char **versionEndPtr, const char **encodingName,
1160 const ENCODING **encoding, int *standalone) {
1161 const char *val = NULL;
1162 const char *name = NULL;
1163 const char *nameEnd = NULL;
1164 ptr += 5 * enc->minBytesPerChar;
1165 end -= 2 * enc->minBytesPerChar;
1166 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1167 || ! name) {
1168 *badPtr = ptr;
1169 return 0;
1170 }
1171 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1172 if (! isGeneralTextEntity) {
1173 *badPtr = name;
1174 return 0;
1175 }
1176 } else {
1177 if (versionPtr)
1178 *versionPtr = val;
1179 if (versionEndPtr)
1180 *versionEndPtr = ptr;
1181 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1182 *badPtr = ptr;
1183 return 0;
1184 }
1185 if (! name) {
1186 if (isGeneralTextEntity) {
1187 /* a TextDecl must have an EncodingDecl */
1188 *badPtr = ptr;
1189 return 0;
1190 }
1191 return 1;
1192 }
1193 }
1194 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1195 int c = toAscii(enc, val, end);
1196 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1197 *badPtr = val;
1198 return 0;
1199 }
1200 if (encodingName)
1201 *encodingName = val;
1202 if (encoding)
1203 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1204 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1205 *badPtr = ptr;
1206 return 0;
1207 }
1208 if (! name)
1209 return 1;
1210 }
1211 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1212 || isGeneralTextEntity) {
1213 *badPtr = name;
1214 return 0;
1215 }
1216 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1217 if (standalone)
1218 *standalone = 1;
1219 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1220 if (standalone)
1221 *standalone = 0;
1222 } else {
1223 *badPtr = val;
1224 return 0;
1225 }
1226 while (isSpace(toAscii(enc, ptr, end)))
1227 ptr += enc->minBytesPerChar;
1228 if (ptr != end) {
1229 *badPtr = ptr;
1230 return 0;
1231 }
1232 return 1;
1233 }
1234
1235 static int FASTCALL
checkCharRefNumber(int result)1236 checkCharRefNumber(int result) {
1237 switch (result >> 8) {
1238 case 0xD8:
1239 case 0xD9:
1240 case 0xDA:
1241 case 0xDB:
1242 case 0xDC:
1243 case 0xDD:
1244 case 0xDE:
1245 case 0xDF:
1246 return -1;
1247 case 0:
1248 if (latin1_encoding.type[result] == BT_NONXML)
1249 return -1;
1250 break;
1251 case 0xFF:
1252 if (result == 0xFFFE || result == 0xFFFF)
1253 return -1;
1254 break;
1255 }
1256 return result;
1257 }
1258
1259 int FASTCALL
XmlUtf8Encode(int c,char * buf)1260 XmlUtf8Encode(int c, char *buf) {
1261 enum {
1262 /* minN is minimum legal resulting value for N byte sequence */
1263 min2 = 0x80,
1264 min3 = 0x800,
1265 min4 = 0x10000
1266 };
1267
1268 if (c < 0)
1269 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1270 if (c < min2) {
1271 buf[0] = (char)(c | UTF8_cval1);
1272 return 1;
1273 }
1274 if (c < min3) {
1275 buf[0] = (char)((c >> 6) | UTF8_cval2);
1276 buf[1] = (char)((c & 0x3f) | 0x80);
1277 return 2;
1278 }
1279 if (c < min4) {
1280 buf[0] = (char)((c >> 12) | UTF8_cval3);
1281 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1282 buf[2] = (char)((c & 0x3f) | 0x80);
1283 return 3;
1284 }
1285 if (c < 0x110000) {
1286 buf[0] = (char)((c >> 18) | UTF8_cval4);
1287 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1288 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1289 buf[3] = (char)((c & 0x3f) | 0x80);
1290 return 4;
1291 }
1292 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1293 }
1294
1295 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1296 XmlUtf16Encode(int charNum, unsigned short *buf) {
1297 if (charNum < 0)
1298 return 0;
1299 if (charNum < 0x10000) {
1300 buf[0] = (unsigned short)charNum;
1301 return 1;
1302 }
1303 if (charNum < 0x110000) {
1304 charNum -= 0x10000;
1305 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1306 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1307 return 2;
1308 }
1309 return 0;
1310 }
1311
/* Encoding object for a caller-described encoding, filled in by
   XmlInitUnknownEncoding.  AS_UNKNOWN_ENCODING casts an ENCODING pointer
   back to this type; XmlInitUnknownEncoding hands out &normal.enc, so
   that cast relies on `normal` being the first member. */
struct unknown_encoding {
  /* starts out as a copy of latin1_encoding, then patched per byte */
  struct normal_encoding normal;
  /* decodes a multi-byte sequence; only required when the table
     contains multi-byte lead bytes */
  CONVERTER convert;
  /* passed as the first argument to convert */
  void *userData;
  /* UTF-16 unit for each byte value; 0 means "ask convert" */
  unsigned short utf16[256];
  /* UTF-8 form for each byte value: [0] holds the length (0 means
     "ask convert"), the encoded bytes follow from [1] */
  char utf8[256][4];
};

#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1321
1322 int
XmlSizeOfUnknownEncoding(void)1323 XmlSizeOfUnknownEncoding(void) {
1324 return sizeof(struct unknown_encoding);
1325 }
1326
1327 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1328 unknown_isName(const ENCODING *enc, const char *p) {
1329 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1330 int c = uenc->convert(uenc->userData, p);
1331 if (c & ~0xFFFF)
1332 return 0;
1333 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1334 }
1335
1336 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1337 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1338 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1339 int c = uenc->convert(uenc->userData, p);
1340 if (c & ~0xFFFF)
1341 return 0;
1342 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1343 }
1344
1345 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1346 unknown_isInvalid(const ENCODING *enc, const char *p) {
1347 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1348 int c = uenc->convert(uenc->userData, p);
1349 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1350 }
1351
1352 static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1353 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1354 char **toP, const char *toLim) {
1355 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1356 char buf[XML_UTF8_ENCODE_MAX];
1357 for (;;) {
1358 const char *utf8;
1359 int n;
1360 if (*fromP == fromLim)
1361 return XML_CONVERT_COMPLETED;
1362 utf8 = uenc->utf8[(unsigned char)**fromP];
1363 n = *utf8++;
1364 if (n == 0) {
1365 int c = uenc->convert(uenc->userData, *fromP);
1366 n = XmlUtf8Encode(c, buf);
1367 if (n > toLim - *toP)
1368 return XML_CONVERT_OUTPUT_EXHAUSTED;
1369 utf8 = buf;
1370 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1371 - (BT_LEAD2 - 2));
1372 } else {
1373 if (n > toLim - *toP)
1374 return XML_CONVERT_OUTPUT_EXHAUSTED;
1375 (*fromP)++;
1376 }
1377 memcpy(*toP, utf8, n);
1378 *toP += n;
1379 }
1380 }
1381
1382 static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1383 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1384 unsigned short **toP, const unsigned short *toLim) {
1385 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1386 while (*fromP < fromLim && *toP < toLim) {
1387 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1388 if (c == 0) {
1389 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1390 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1391 - (BT_LEAD2 - 2));
1392 } else
1393 (*fromP)++;
1394 *(*toP)++ = c;
1395 }
1396
1397 if ((*toP == toLim) && (*fromP < fromLim))
1398 return XML_CONVERT_OUTPUT_EXHAUSTED;
1399 else
1400 return XML_CONVERT_COMPLETED;
1401 }
1402
1403 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)1404 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1405 void *userData) {
1406 int i;
1407 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1408 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1409 for (i = 0; i < 128; i++)
1410 if (latin1_encoding.type[i] != BT_OTHER
1411 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1412 return 0;
1413 for (i = 0; i < 256; i++) {
1414 int c = table[i];
1415 if (c == -1) {
1416 e->normal.type[i] = BT_MALFORM;
1417 /* This shouldn't really get used. */
1418 e->utf16[i] = 0xFFFF;
1419 e->utf8[i][0] = 1;
1420 e->utf8[i][1] = 0;
1421 } else if (c < 0) {
1422 if (c < -4)
1423 return 0;
1424 /* Multi-byte sequences need a converter function */
1425 if (! convert)
1426 return 0;
1427 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1428 e->utf8[i][0] = 0;
1429 e->utf16[i] = 0;
1430 } else if (c < 0x80) {
1431 if (latin1_encoding.type[c] != BT_OTHER
1432 && latin1_encoding.type[c] != BT_NONXML && c != i)
1433 return 0;
1434 e->normal.type[i] = latin1_encoding.type[c];
1435 e->utf8[i][0] = 1;
1436 e->utf8[i][1] = (char)c;
1437 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1438 } else if (checkCharRefNumber(c) < 0) {
1439 e->normal.type[i] = BT_NONXML;
1440 /* This shouldn't really get used. */
1441 e->utf16[i] = 0xFFFF;
1442 e->utf8[i][0] = 1;
1443 e->utf8[i][1] = 0;
1444 } else {
1445 if (c > 0xFFFF)
1446 return 0;
1447 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1448 e->normal.type[i] = BT_NMSTRT;
1449 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1450 e->normal.type[i] = BT_NAME;
1451 else
1452 e->normal.type[i] = BT_OTHER;
1453 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1454 e->utf16[i] = (unsigned short)c;
1455 }
1456 }
1457 e->userData = userData;
1458 e->convert = convert;
1459 if (convert) {
1460 e->normal.isName2 = unknown_isName;
1461 e->normal.isName3 = unknown_isName;
1462 e->normal.isName4 = unknown_isName;
1463 e->normal.isNmstrt2 = unknown_isNmstrt;
1464 e->normal.isNmstrt3 = unknown_isNmstrt;
1465 e->normal.isNmstrt4 = unknown_isNmstrt;
1466 e->normal.isInvalid2 = unknown_isInvalid;
1467 e->normal.isInvalid3 = unknown_isInvalid;
1468 e->normal.isInvalid4 = unknown_isInvalid;
1469 }
1470 e->normal.enc.utf8Convert = unknown_toUtf8;
1471 e->normal.enc.utf16Convert = unknown_toUtf16;
1472 return &(e->normal.enc);
1473 }
1474
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1488
/* Upper-case names of the built-in encodings ("ISO-8859-1", "US-ASCII",
   "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE"), spelled out from ASCII_*
   constants and explicitly NUL-terminated.  getEncodingIndex compares
   candidate names against these with streqci. */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1505
1506 static int FASTCALL
getEncodingIndex(const char * name)1507 getEncodingIndex(const char *name) {
1508 static const char *const encodingNames[] = {
1509 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1510 };
1511 int i;
1512 if (name == NULL)
1513 return NO_ENC;
1514 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1515 if (streqci(name, encodingNames[i]))
1516 return i;
1517 return UNKNOWN_ENC;
1518 }
1519
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i narrowed to char; the argument is
   parenthesized so that an expression such as `a ? b : c` is converted
   as a whole rather than having the cast bind to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1526
1527 /* This is what detects the encoding. encodingTable maps from
1528 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1529 the external (protocol) specified encoding; state is
1530 XML_CONTENT_STATE if we're parsing an external text entity, and
1531 XML_PROLOG_STATE otherwise.
1532 */
1533
1534 static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)1535 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1536 int state, const char *ptr, const char *end, const char **nextTokPtr) {
1537 const ENCODING **encPtr;
1538
1539 if (ptr >= end)
1540 return XML_TOK_NONE;
1541 encPtr = enc->encPtr;
1542 if (ptr + 1 == end) {
1543 /* only a single byte available for auto-detection */
1544 #ifndef XML_DTD /* FIXME */
1545 /* a well-formed document entity must have more than one byte */
1546 if (state != XML_CONTENT_STATE)
1547 return XML_TOK_PARTIAL;
1548 #endif
1549 /* so we're parsing an external text entity... */
1550 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1551 switch (INIT_ENC_INDEX(enc)) {
1552 case UTF_16_ENC:
1553 case UTF_16LE_ENC:
1554 case UTF_16BE_ENC:
1555 return XML_TOK_PARTIAL;
1556 }
1557 switch ((unsigned char)*ptr) {
1558 case 0xFE:
1559 case 0xFF:
1560 case 0xEF: /* possibly first byte of UTF-8 BOM */
1561 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1562 break;
1563 /* fall through */
1564 case 0x00:
1565 case 0x3C:
1566 return XML_TOK_PARTIAL;
1567 }
1568 } else {
1569 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1570 case 0xFEFF:
1571 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1572 break;
1573 *nextTokPtr = ptr + 2;
1574 *encPtr = encodingTable[UTF_16BE_ENC];
1575 return XML_TOK_BOM;
1576 /* 00 3C is handled in the default case */
1577 case 0x3C00:
1578 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1579 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1580 && state == XML_CONTENT_STATE)
1581 break;
1582 *encPtr = encodingTable[UTF_16LE_ENC];
1583 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1584 case 0xFFFE:
1585 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1586 break;
1587 *nextTokPtr = ptr + 2;
1588 *encPtr = encodingTable[UTF_16LE_ENC];
1589 return XML_TOK_BOM;
1590 case 0xEFBB:
1591 /* Maybe a UTF-8 BOM (EF BB BF) */
1592 /* If there's an explicitly specified (external) encoding
1593 of ISO-8859-1 or some flavour of UTF-16
1594 and this is an external text entity,
1595 don't look for the BOM,
1596 because it might be a legal data.
1597 */
1598 if (state == XML_CONTENT_STATE) {
1599 int e = INIT_ENC_INDEX(enc);
1600 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1601 || e == UTF_16_ENC)
1602 break;
1603 }
1604 if (ptr + 2 == end)
1605 return XML_TOK_PARTIAL;
1606 if ((unsigned char)ptr[2] == 0xBF) {
1607 *nextTokPtr = ptr + 3;
1608 *encPtr = encodingTable[UTF_8_ENC];
1609 return XML_TOK_BOM;
1610 }
1611 break;
1612 default:
1613 if (ptr[0] == '\0') {
1614 /* 0 isn't a legal data character. Furthermore a document
1615 entity can only start with ASCII characters. So the only
1616 way this can fail to be big-endian UTF-16 if it it's an
1617 external parsed general entity that's labelled as
1618 UTF-16LE.
1619 */
1620 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1621 break;
1622 *encPtr = encodingTable[UTF_16BE_ENC];
1623 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1624 } else if (ptr[1] == '\0') {
1625 /* We could recover here in the case:
1626 - parsing an external entity
1627 - second byte is 0
1628 - no externally specified encoding
1629 - no encoding declaration
1630 by assuming UTF-16LE. But we don't, because this would mean when
1631 presented just with a single byte, we couldn't reliably determine
1632 whether we needed further bytes.
1633 */
1634 if (state == XML_CONTENT_STATE)
1635 break;
1636 *encPtr = encodingTable[UTF_16LE_ENC];
1637 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1638 }
1639 break;
1640 }
1641 }
1642 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1643 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1644 }
1645
1646 #define NS(x) x
1647 #define ns(x) x
1648 #define XML_TOK_NS_C
1649 #include "xmltok_ns.c"
1650 #undef XML_TOK_NS_C
1651 #undef NS
1652 #undef ns
1653
1654 #ifdef XML_NS
1655
1656 # define NS(x) x##NS
1657 # define ns(x) x##_ns
1658
1659 # define XML_TOK_NS_C
1660 # include "xmltok_ns.c"
1661 # undef XML_TOK_NS_C
1662
1663 # undef NS
1664 # undef ns
1665
1666 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)1667 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1668 void *userData) {
1669 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1670 if (enc)
1671 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1672 return enc;
1673 }
1674
1675 #endif /* XML_NS */
1676
1677 #endif /* LV_USE_XML */
1678
1679