1 /* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2002 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
13 Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
14 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
15 Copyright (c) 2018 Benjamin Peterson <benjamin@python.org>
16 Copyright (c) 2018 Anton Maklakov <antmak.pub@gmail.com>
17 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
18 Copyright (c) 2020 Boris Kolpackov <boris@codesynthesis.com>
19 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com>
20 Licensed under the MIT license:
21
22 Permission is hereby granted, free of charge, to any person obtaining
23 a copy of this software and associated documentation files (the
24 "Software"), to deal in the Software without restriction, including
25 without limitation the rights to use, copy, modify, merge, publish,
26 distribute, sublicense, and/or sell copies of the Software, and to permit
27 persons to whom the Software is furnished to do so, subject to the
28 following conditions:
29
30 The above copyright notice and this permission notice shall be included
31 in all copies or substantial portions of the Software.
32
33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
34 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
35 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
36 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
37 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
38 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
39 USE OR OTHER DEALINGS IN THE SOFTWARE.
40 */
41
42 #include "../../lv_conf_internal.h"
43 #if LV_USE_XML
44
45 #ifdef XML_TOK_IMPL_C
46
/* Fallback used when the including file has not supplied IS_INVALID_CHAR
   (i.e. for UTF-16 and XML_MIN_SIZE not defined): no multi-byte sequence
   is ever reported as invalid. */
#  ifndef IS_INVALID_CHAR
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif

/* Expands to the `case BT_LEAD<n>:` arm of a byte-type switch.

   n          - literal 2, 3 or 4; it is token-pasted, so it must be passed
                as a bare literal.
   ptr        - modifiable pointer to the lead byte; advanced by n on success.
   nextTokPtr - where the position of an invalid sequence is stored.

   The expansion also reads `enc` and `end` from the enclosing scope.
   It returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain and
   XML_TOK_INVALID when IS_INVALID_CHAR rejects the sequence; otherwise it
   skips the sequence and leaves the switch. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/* Switch arms shared by the scanners: the three multi-byte lead cases above,
   plus BT_NONXML, BT_MALFORM and BT_TRAIL, each of which is reported as
   XML_TOK_INVALID at ptr. */
#  define INVALID_CASES(ptr, nextTokPtr) \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
71
/* Expands to the `case BT_LEAD<n>:` arm for a multi-byte character that must
   be a name character.

   n must be a bare literal (2, 3 or 4): it is token-pasted here and handed
   on unchanged to IS_INVALID_CHAR / IS_NAME_CHAR.  ptr must be a modifiable
   pointer; it is advanced by n on success.

   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end,
   and XML_TOK_INVALID (with *nextTokPtr = ptr) when the sequence is invalid
   or is not a name character.  All pointer arguments are parenthesized so
   that expression arguments keep their meaning. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/* Switch arms that accept one name character at ptr and step over it:
   BT_NONASCII (only if IS_NAME_CHAR_MINBPC agrees, otherwise
   XML_TOK_INVALID), BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME, BT_MINUS, and the
   multi-byte lead cases handled by CHECK_NAME_CASE. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
100
/* Expands to the `case BT_LEAD<n>:` arm for a multi-byte character that must
   be a name-start character.

   n must be a bare literal (2, 3 or 4): it is token-pasted here and handed
   on unchanged to IS_INVALID_CHAR / IS_NMSTRT_CHAR.  ptr must be a
   modifiable pointer; it is advanced by n on success.

   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end,
   and XML_TOK_INVALID (with *nextTokPtr = ptr) when the sequence is invalid
   or is not a name-start character. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/* Switch arms that accept one name-start character at ptr and step over it:
   BT_NONASCII (only if IS_NMSTRT_CHAR_MINBPC agrees, otherwise
   XML_TOK_INVALID), BT_NMSTRT, BT_HEX, and the multi-byte lead cases handled
   by CHECK_NMSTRT_CASE. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
126
/* PREFIX decorates the names of the functions defined in this file; unless
   the including file has defined it already, names are left as they are. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif

/* True when at least `count` characters of MINBPC(enc) bytes each are
   available in [ptr, end). */
#  define HAS_CHARS(enc, ptr, end, count) \
    (! ((end) - (ptr) < ((count) * MINBPC(enc))))

/* Single-character form of HAS_CHARS. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   characters are available in [ptr, end).  Expands to a braced block. */
#  define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) \
        return XML_TOK_PARTIAL; \
    }

/* Single-character form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
144
145 /* ptr points to character following "<!-" */
146
147 static int PTRCALL
PREFIX(scanComment)148 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
149 const char **nextTokPtr) {
150 if (HAS_CHAR(enc, ptr, end)) {
151 if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
152 *nextTokPtr = ptr;
153 return XML_TOK_INVALID;
154 }
155 ptr += MINBPC(enc);
156 while (HAS_CHAR(enc, ptr, end)) {
157 switch (BYTE_TYPE(enc, ptr)) {
158 INVALID_CASES(ptr, nextTokPtr)
159 case BT_MINUS:
160 ptr += MINBPC(enc);
161 REQUIRE_CHAR(enc, ptr, end);
162 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
163 ptr += MINBPC(enc);
164 REQUIRE_CHAR(enc, ptr, end);
165 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
166 *nextTokPtr = ptr;
167 return XML_TOK_INVALID;
168 }
169 *nextTokPtr = ptr + MINBPC(enc);
170 return XML_TOK_COMMENT;
171 }
172 break;
173 default:
174 ptr += MINBPC(enc);
175 break;
176 }
177 }
178 }
179 return XML_TOK_PARTIAL;
180 }
181
182 /* ptr points to character following "<!" */
183
184 static int PTRCALL
PREFIX(scanDecl)185 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
186 const char **nextTokPtr) {
187 REQUIRE_CHAR(enc, ptr, end);
188 switch (BYTE_TYPE(enc, ptr)) {
189 case BT_MINUS:
190 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
191 case BT_LSQB:
192 *nextTokPtr = ptr + MINBPC(enc);
193 return XML_TOK_COND_SECT_OPEN;
194 case BT_NMSTRT:
195 case BT_HEX:
196 ptr += MINBPC(enc);
197 break;
198 default:
199 *nextTokPtr = ptr;
200 return XML_TOK_INVALID;
201 }
202 while (HAS_CHAR(enc, ptr, end)) {
203 switch (BYTE_TYPE(enc, ptr)) {
204 case BT_PERCNT:
205 REQUIRE_CHARS(enc, ptr, end, 2);
206 /* don't allow <!ENTITY% foo "whatever"> */
207 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
208 case BT_S:
209 case BT_CR:
210 case BT_LF:
211 case BT_PERCNT:
212 *nextTokPtr = ptr;
213 return XML_TOK_INVALID;
214 }
215 /* fall through */
216 case BT_S:
217 case BT_CR:
218 case BT_LF:
219 *nextTokPtr = ptr;
220 return XML_TOK_DECL_OPEN;
221 case BT_NMSTRT:
222 case BT_HEX:
223 ptr += MINBPC(enc);
224 break;
225 default:
226 *nextTokPtr = ptr;
227 return XML_TOK_INVALID;
228 }
229 }
230 return XML_TOK_PARTIAL;
231 }
232
233 static int PTRCALL
PREFIX(checkPiTarget)234 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
235 int *tokPtr) {
236 int upper = 0;
237 UNUSED_P(enc);
238 *tokPtr = XML_TOK_PI;
239 if (end - ptr != MINBPC(enc) * 3)
240 return 1;
241 switch (BYTE_TO_ASCII(enc, ptr)) {
242 case ASCII_x:
243 break;
244 case ASCII_X:
245 upper = 1;
246 break;
247 default:
248 return 1;
249 }
250 ptr += MINBPC(enc);
251 switch (BYTE_TO_ASCII(enc, ptr)) {
252 case ASCII_m:
253 break;
254 case ASCII_M:
255 upper = 1;
256 break;
257 default:
258 return 1;
259 }
260 ptr += MINBPC(enc);
261 switch (BYTE_TO_ASCII(enc, ptr)) {
262 case ASCII_l:
263 break;
264 case ASCII_L:
265 upper = 1;
266 break;
267 default:
268 return 1;
269 }
270 if (upper)
271 return 0;
272 *tokPtr = XML_TOK_XML_DECL;
273 return 1;
274 }
275
276 /* ptr points to character following "<?" */
277
278 static int PTRCALL
PREFIX(scanPi)279 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
280 const char **nextTokPtr) {
281 int tok;
282 const char *target = ptr;
283 REQUIRE_CHAR(enc, ptr, end);
284 switch (BYTE_TYPE(enc, ptr)) {
285 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
286 default:
287 *nextTokPtr = ptr;
288 return XML_TOK_INVALID;
289 }
290 while (HAS_CHAR(enc, ptr, end)) {
291 switch (BYTE_TYPE(enc, ptr)) {
292 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
293 case BT_S:
294 case BT_CR:
295 case BT_LF:
296 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
297 *nextTokPtr = ptr;
298 return XML_TOK_INVALID;
299 }
300 ptr += MINBPC(enc);
301 while (HAS_CHAR(enc, ptr, end)) {
302 switch (BYTE_TYPE(enc, ptr)) {
303 INVALID_CASES(ptr, nextTokPtr)
304 case BT_QUEST:
305 ptr += MINBPC(enc);
306 REQUIRE_CHAR(enc, ptr, end);
307 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
308 *nextTokPtr = ptr + MINBPC(enc);
309 return tok;
310 }
311 break;
312 default:
313 ptr += MINBPC(enc);
314 break;
315 }
316 }
317 return XML_TOK_PARTIAL;
318 case BT_QUEST:
319 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
320 *nextTokPtr = ptr;
321 return XML_TOK_INVALID;
322 }
323 ptr += MINBPC(enc);
324 REQUIRE_CHAR(enc, ptr, end);
325 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
326 *nextTokPtr = ptr + MINBPC(enc);
327 return tok;
328 }
329 /* fall through */
330 default:
331 *nextTokPtr = ptr;
332 return XML_TOK_INVALID;
333 }
334 }
335 return XML_TOK_PARTIAL;
336 }
337
338 static int PTRCALL
PREFIX(scanCdataSection)339 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
340 const char **nextTokPtr) {
341 static const char CDATA_LSQB[]
342 = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
343 int i;
344 UNUSED_P(enc);
345 /* CDATA[ */
346 REQUIRE_CHARS(enc, ptr, end, 6);
347 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
348 if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
349 *nextTokPtr = ptr;
350 return XML_TOK_INVALID;
351 }
352 }
353 *nextTokPtr = ptr;
354 return XML_TOK_CDATA_SECT_OPEN;
355 }
356
357 static int PTRCALL
PREFIX(cdataSectionTok)358 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
359 const char **nextTokPtr) {
360 if (ptr >= end)
361 return XML_TOK_NONE;
362 if (MINBPC(enc) > 1) {
363 size_t n = end - ptr;
364 if (n & (MINBPC(enc) - 1)) {
365 n &= ~(MINBPC(enc) - 1);
366 if (n == 0)
367 return XML_TOK_PARTIAL;
368 end = ptr + n;
369 }
370 }
371 switch (BYTE_TYPE(enc, ptr)) {
372 case BT_RSQB:
373 ptr += MINBPC(enc);
374 REQUIRE_CHAR(enc, ptr, end);
375 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
376 break;
377 ptr += MINBPC(enc);
378 REQUIRE_CHAR(enc, ptr, end);
379 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
380 ptr -= MINBPC(enc);
381 break;
382 }
383 *nextTokPtr = ptr + MINBPC(enc);
384 return XML_TOK_CDATA_SECT_CLOSE;
385 case BT_CR:
386 ptr += MINBPC(enc);
387 REQUIRE_CHAR(enc, ptr, end);
388 if (BYTE_TYPE(enc, ptr) == BT_LF)
389 ptr += MINBPC(enc);
390 *nextTokPtr = ptr;
391 return XML_TOK_DATA_NEWLINE;
392 case BT_LF:
393 *nextTokPtr = ptr + MINBPC(enc);
394 return XML_TOK_DATA_NEWLINE;
395 INVALID_CASES(ptr, nextTokPtr)
396 default:
397 ptr += MINBPC(enc);
398 break;
399 }
400 while (HAS_CHAR(enc, ptr, end)) {
401 switch (BYTE_TYPE(enc, ptr)) {
402 # define LEAD_CASE(n) \
403 case BT_LEAD##n: \
404 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
405 *nextTokPtr = ptr; \
406 return XML_TOK_DATA_CHARS; \
407 } \
408 ptr += n; \
409 break;
410 LEAD_CASE(2)
411 LEAD_CASE(3)
412 LEAD_CASE(4)
413 # undef LEAD_CASE
414 case BT_NONXML:
415 case BT_MALFORM:
416 case BT_TRAIL:
417 case BT_CR:
418 case BT_LF:
419 case BT_RSQB:
420 *nextTokPtr = ptr;
421 return XML_TOK_DATA_CHARS;
422 default:
423 ptr += MINBPC(enc);
424 break;
425 }
426 }
427 *nextTokPtr = ptr;
428 return XML_TOK_DATA_CHARS;
429 }
430
431 /* ptr points to character following "</" */
432
433 static int PTRCALL
PREFIX(scanEndTag)434 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
435 const char **nextTokPtr) {
436 REQUIRE_CHAR(enc, ptr, end);
437 switch (BYTE_TYPE(enc, ptr)) {
438 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
439 default:
440 *nextTokPtr = ptr;
441 return XML_TOK_INVALID;
442 }
443 while (HAS_CHAR(enc, ptr, end)) {
444 switch (BYTE_TYPE(enc, ptr)) {
445 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
446 case BT_S:
447 case BT_CR:
448 case BT_LF:
449 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
450 switch (BYTE_TYPE(enc, ptr)) {
451 case BT_S:
452 case BT_CR:
453 case BT_LF:
454 break;
455 case BT_GT:
456 *nextTokPtr = ptr + MINBPC(enc);
457 return XML_TOK_END_TAG;
458 default:
459 *nextTokPtr = ptr;
460 return XML_TOK_INVALID;
461 }
462 }
463 return XML_TOK_PARTIAL;
464 # ifdef XML_NS
465 case BT_COLON:
466 /* no need to check qname syntax here,
467 since end-tag must match exactly */
468 ptr += MINBPC(enc);
469 break;
470 # endif
471 case BT_GT:
472 *nextTokPtr = ptr + MINBPC(enc);
473 return XML_TOK_END_TAG;
474 default:
475 *nextTokPtr = ptr;
476 return XML_TOK_INVALID;
477 }
478 }
479 return XML_TOK_PARTIAL;
480 }
481
482 /* ptr points to character following "&#X" */
483
484 static int PTRCALL
PREFIX(scanHexCharRef)485 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
486 const char **nextTokPtr) {
487 if (HAS_CHAR(enc, ptr, end)) {
488 switch (BYTE_TYPE(enc, ptr)) {
489 case BT_DIGIT:
490 case BT_HEX:
491 break;
492 default:
493 *nextTokPtr = ptr;
494 return XML_TOK_INVALID;
495 }
496 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
497 switch (BYTE_TYPE(enc, ptr)) {
498 case BT_DIGIT:
499 case BT_HEX:
500 break;
501 case BT_SEMI:
502 *nextTokPtr = ptr + MINBPC(enc);
503 return XML_TOK_CHAR_REF;
504 default:
505 *nextTokPtr = ptr;
506 return XML_TOK_INVALID;
507 }
508 }
509 }
510 return XML_TOK_PARTIAL;
511 }
512
513 /* ptr points to character following "&#" */
514
515 static int PTRCALL
PREFIX(scanCharRef)516 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
517 const char **nextTokPtr) {
518 if (HAS_CHAR(enc, ptr, end)) {
519 if (CHAR_MATCHES(enc, ptr, ASCII_x))
520 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
521 switch (BYTE_TYPE(enc, ptr)) {
522 case BT_DIGIT:
523 break;
524 default:
525 *nextTokPtr = ptr;
526 return XML_TOK_INVALID;
527 }
528 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
529 switch (BYTE_TYPE(enc, ptr)) {
530 case BT_DIGIT:
531 break;
532 case BT_SEMI:
533 *nextTokPtr = ptr + MINBPC(enc);
534 return XML_TOK_CHAR_REF;
535 default:
536 *nextTokPtr = ptr;
537 return XML_TOK_INVALID;
538 }
539 }
540 }
541 return XML_TOK_PARTIAL;
542 }
543
544 /* ptr points to character following "&" */
545
546 static int PTRCALL
PREFIX(scanRef)547 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
548 const char **nextTokPtr) {
549 REQUIRE_CHAR(enc, ptr, end);
550 switch (BYTE_TYPE(enc, ptr)) {
551 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
552 case BT_NUM:
553 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
554 default:
555 *nextTokPtr = ptr;
556 return XML_TOK_INVALID;
557 }
558 while (HAS_CHAR(enc, ptr, end)) {
559 switch (BYTE_TYPE(enc, ptr)) {
560 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
561 case BT_SEMI:
562 *nextTokPtr = ptr + MINBPC(enc);
563 return XML_TOK_ENTITY_REF;
564 default:
565 *nextTokPtr = ptr;
566 return XML_TOK_INVALID;
567 }
568 }
569 return XML_TOK_PARTIAL;
570 }
571
572 /* ptr points to character following first character of attribute name */
573
574 static int PTRCALL
PREFIX(scanAtts)575 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
576 const char **nextTokPtr) {
577 # ifdef XML_NS
578 int hadColon = 0;
579 # endif
580 while (HAS_CHAR(enc, ptr, end)) {
581 switch (BYTE_TYPE(enc, ptr)) {
582 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
583 # ifdef XML_NS
584 case BT_COLON:
585 if (hadColon) {
586 *nextTokPtr = ptr;
587 return XML_TOK_INVALID;
588 }
589 hadColon = 1;
590 ptr += MINBPC(enc);
591 REQUIRE_CHAR(enc, ptr, end);
592 switch (BYTE_TYPE(enc, ptr)) {
593 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
594 default:
595 *nextTokPtr = ptr;
596 return XML_TOK_INVALID;
597 }
598 break;
599 # endif
600 case BT_S:
601 case BT_CR:
602 case BT_LF:
603 for (;;) {
604 int t;
605
606 ptr += MINBPC(enc);
607 REQUIRE_CHAR(enc, ptr, end);
608 t = BYTE_TYPE(enc, ptr);
609 if (t == BT_EQUALS)
610 break;
611 switch (t) {
612 case BT_S:
613 case BT_LF:
614 case BT_CR:
615 break;
616 default:
617 *nextTokPtr = ptr;
618 return XML_TOK_INVALID;
619 }
620 }
621 /* fall through */
622 case BT_EQUALS: {
623 int open;
624 # ifdef XML_NS
625 hadColon = 0;
626 # endif
627 for (;;) {
628 ptr += MINBPC(enc);
629 REQUIRE_CHAR(enc, ptr, end);
630 open = BYTE_TYPE(enc, ptr);
631 if (open == BT_QUOT || open == BT_APOS)
632 break;
633 switch (open) {
634 case BT_S:
635 case BT_LF:
636 case BT_CR:
637 break;
638 default:
639 *nextTokPtr = ptr;
640 return XML_TOK_INVALID;
641 }
642 }
643 ptr += MINBPC(enc);
644 /* in attribute value */
645 for (;;) {
646 int t;
647 REQUIRE_CHAR(enc, ptr, end);
648 t = BYTE_TYPE(enc, ptr);
649 if (t == open)
650 break;
651 switch (t) {
652 INVALID_CASES(ptr, nextTokPtr)
653 case BT_AMP: {
654 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
655 if (tok <= 0) {
656 if (tok == XML_TOK_INVALID)
657 *nextTokPtr = ptr;
658 return tok;
659 }
660 break;
661 }
662 case BT_LT:
663 *nextTokPtr = ptr;
664 return XML_TOK_INVALID;
665 default:
666 ptr += MINBPC(enc);
667 break;
668 }
669 }
670 ptr += MINBPC(enc);
671 REQUIRE_CHAR(enc, ptr, end);
672 switch (BYTE_TYPE(enc, ptr)) {
673 case BT_S:
674 case BT_CR:
675 case BT_LF:
676 break;
677 case BT_SOL:
678 goto sol;
679 case BT_GT:
680 goto gt;
681 default:
682 *nextTokPtr = ptr;
683 return XML_TOK_INVALID;
684 }
685 /* ptr points to closing quote */
686 for (;;) {
687 ptr += MINBPC(enc);
688 REQUIRE_CHAR(enc, ptr, end);
689 switch (BYTE_TYPE(enc, ptr)) {
690 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
691 case BT_S:
692 case BT_CR:
693 case BT_LF:
694 continue;
695 case BT_GT:
696 gt:
697 *nextTokPtr = ptr + MINBPC(enc);
698 return XML_TOK_START_TAG_WITH_ATTS;
699 case BT_SOL:
700 sol:
701 ptr += MINBPC(enc);
702 REQUIRE_CHAR(enc, ptr, end);
703 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
704 *nextTokPtr = ptr;
705 return XML_TOK_INVALID;
706 }
707 *nextTokPtr = ptr + MINBPC(enc);
708 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
709 default:
710 *nextTokPtr = ptr;
711 return XML_TOK_INVALID;
712 }
713 break;
714 }
715 break;
716 }
717 default:
718 *nextTokPtr = ptr;
719 return XML_TOK_INVALID;
720 }
721 }
722 return XML_TOK_PARTIAL;
723 }
724
725 /* ptr points to character following "<" */
726
727 static int PTRCALL
PREFIX(scanLt)728 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
729 const char **nextTokPtr) {
730 # ifdef XML_NS
731 int hadColon;
732 # endif
733 REQUIRE_CHAR(enc, ptr, end);
734 switch (BYTE_TYPE(enc, ptr)) {
735 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
736 case BT_EXCL:
737 ptr += MINBPC(enc);
738 REQUIRE_CHAR(enc, ptr, end);
739 switch (BYTE_TYPE(enc, ptr)) {
740 case BT_MINUS:
741 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
742 case BT_LSQB:
743 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
744 }
745 *nextTokPtr = ptr;
746 return XML_TOK_INVALID;
747 case BT_QUEST:
748 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
749 case BT_SOL:
750 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
751 default:
752 *nextTokPtr = ptr;
753 return XML_TOK_INVALID;
754 }
755 # ifdef XML_NS
756 hadColon = 0;
757 # endif
758 /* we have a start-tag */
759 while (HAS_CHAR(enc, ptr, end)) {
760 switch (BYTE_TYPE(enc, ptr)) {
761 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
762 # ifdef XML_NS
763 case BT_COLON:
764 if (hadColon) {
765 *nextTokPtr = ptr;
766 return XML_TOK_INVALID;
767 }
768 hadColon = 1;
769 ptr += MINBPC(enc);
770 REQUIRE_CHAR(enc, ptr, end);
771 switch (BYTE_TYPE(enc, ptr)) {
772 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
773 default:
774 *nextTokPtr = ptr;
775 return XML_TOK_INVALID;
776 }
777 break;
778 # endif
779 case BT_S:
780 case BT_CR:
781 case BT_LF: {
782 ptr += MINBPC(enc);
783 while (HAS_CHAR(enc, ptr, end)) {
784 switch (BYTE_TYPE(enc, ptr)) {
785 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
786 case BT_GT:
787 goto gt;
788 case BT_SOL:
789 goto sol;
790 case BT_S:
791 case BT_CR:
792 case BT_LF:
793 ptr += MINBPC(enc);
794 continue;
795 default:
796 *nextTokPtr = ptr;
797 return XML_TOK_INVALID;
798 }
799 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
800 }
801 return XML_TOK_PARTIAL;
802 }
803 case BT_GT:
804 gt:
805 *nextTokPtr = ptr + MINBPC(enc);
806 return XML_TOK_START_TAG_NO_ATTS;
807 case BT_SOL:
808 sol:
809 ptr += MINBPC(enc);
810 REQUIRE_CHAR(enc, ptr, end);
811 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
812 *nextTokPtr = ptr;
813 return XML_TOK_INVALID;
814 }
815 *nextTokPtr = ptr + MINBPC(enc);
816 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
817 default:
818 *nextTokPtr = ptr;
819 return XML_TOK_INVALID;
820 }
821 }
822 return XML_TOK_PARTIAL;
823 }
824
825 static int PTRCALL
PREFIX(contentTok)826 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
827 const char **nextTokPtr) {
828 if (ptr >= end)
829 return XML_TOK_NONE;
830 if (MINBPC(enc) > 1) {
831 size_t n = end - ptr;
832 if (n & (MINBPC(enc) - 1)) {
833 n &= ~(MINBPC(enc) - 1);
834 if (n == 0)
835 return XML_TOK_PARTIAL;
836 end = ptr + n;
837 }
838 }
839 switch (BYTE_TYPE(enc, ptr)) {
840 case BT_LT:
841 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
842 case BT_AMP:
843 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
844 case BT_CR:
845 ptr += MINBPC(enc);
846 if (! HAS_CHAR(enc, ptr, end))
847 return XML_TOK_TRAILING_CR;
848 if (BYTE_TYPE(enc, ptr) == BT_LF)
849 ptr += MINBPC(enc);
850 *nextTokPtr = ptr;
851 return XML_TOK_DATA_NEWLINE;
852 case BT_LF:
853 *nextTokPtr = ptr + MINBPC(enc);
854 return XML_TOK_DATA_NEWLINE;
855 case BT_RSQB:
856 ptr += MINBPC(enc);
857 if (! HAS_CHAR(enc, ptr, end))
858 return XML_TOK_TRAILING_RSQB;
859 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
860 break;
861 ptr += MINBPC(enc);
862 if (! HAS_CHAR(enc, ptr, end))
863 return XML_TOK_TRAILING_RSQB;
864 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
865 ptr -= MINBPC(enc);
866 break;
867 }
868 *nextTokPtr = ptr;
869 return XML_TOK_INVALID;
870 INVALID_CASES(ptr, nextTokPtr)
871 default:
872 ptr += MINBPC(enc);
873 break;
874 }
875 while (HAS_CHAR(enc, ptr, end)) {
876 switch (BYTE_TYPE(enc, ptr)) {
877 # define LEAD_CASE(n) \
878 case BT_LEAD##n: \
879 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
880 *nextTokPtr = ptr; \
881 return XML_TOK_DATA_CHARS; \
882 } \
883 ptr += n; \
884 break;
885 LEAD_CASE(2)
886 LEAD_CASE(3)
887 LEAD_CASE(4)
888 # undef LEAD_CASE
889 case BT_RSQB:
890 if (HAS_CHARS(enc, ptr, end, 2)) {
891 if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
892 ptr += MINBPC(enc);
893 break;
894 }
895 if (HAS_CHARS(enc, ptr, end, 3)) {
896 if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
897 ptr += MINBPC(enc);
898 break;
899 }
900 *nextTokPtr = ptr + 2 * MINBPC(enc);
901 return XML_TOK_INVALID;
902 }
903 }
904 /* fall through */
905 case BT_AMP:
906 case BT_LT:
907 case BT_NONXML:
908 case BT_MALFORM:
909 case BT_TRAIL:
910 case BT_CR:
911 case BT_LF:
912 *nextTokPtr = ptr;
913 return XML_TOK_DATA_CHARS;
914 default:
915 ptr += MINBPC(enc);
916 break;
917 }
918 }
919 *nextTokPtr = ptr;
920 return XML_TOK_DATA_CHARS;
921 }
922
923 /* ptr points to character following "%" */
924
925 static int PTRCALL
PREFIX(scanPercent)926 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
927 const char **nextTokPtr) {
928 REQUIRE_CHAR(enc, ptr, end);
929 switch (BYTE_TYPE(enc, ptr)) {
930 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
931 case BT_S:
932 case BT_LF:
933 case BT_CR:
934 case BT_PERCNT:
935 *nextTokPtr = ptr;
936 return XML_TOK_PERCENT;
937 default:
938 *nextTokPtr = ptr;
939 return XML_TOK_INVALID;
940 }
941 while (HAS_CHAR(enc, ptr, end)) {
942 switch (BYTE_TYPE(enc, ptr)) {
943 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
944 case BT_SEMI:
945 *nextTokPtr = ptr + MINBPC(enc);
946 return XML_TOK_PARAM_ENTITY_REF;
947 default:
948 *nextTokPtr = ptr;
949 return XML_TOK_INVALID;
950 }
951 }
952 return XML_TOK_PARTIAL;
953 }
954
955 static int PTRCALL
PREFIX(scanPoundName)956 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
957 const char **nextTokPtr) {
958 REQUIRE_CHAR(enc, ptr, end);
959 switch (BYTE_TYPE(enc, ptr)) {
960 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
961 default:
962 *nextTokPtr = ptr;
963 return XML_TOK_INVALID;
964 }
965 while (HAS_CHAR(enc, ptr, end)) {
966 switch (BYTE_TYPE(enc, ptr)) {
967 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
968 case BT_CR:
969 case BT_LF:
970 case BT_S:
971 case BT_RPAR:
972 case BT_GT:
973 case BT_PERCNT:
974 case BT_VERBAR:
975 *nextTokPtr = ptr;
976 return XML_TOK_POUND_NAME;
977 default:
978 *nextTokPtr = ptr;
979 return XML_TOK_INVALID;
980 }
981 }
982 return -XML_TOK_POUND_NAME;
983 }
984
985 static int PTRCALL
PREFIX(scanLit)986 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
987 const char **nextTokPtr) {
988 while (HAS_CHAR(enc, ptr, end)) {
989 int t = BYTE_TYPE(enc, ptr);
990 switch (t) {
991 INVALID_CASES(ptr, nextTokPtr)
992 case BT_QUOT:
993 case BT_APOS:
994 ptr += MINBPC(enc);
995 if (t != open)
996 break;
997 if (! HAS_CHAR(enc, ptr, end))
998 return -XML_TOK_LITERAL;
999 *nextTokPtr = ptr;
1000 switch (BYTE_TYPE(enc, ptr)) {
1001 case BT_S:
1002 case BT_CR:
1003 case BT_LF:
1004 case BT_GT:
1005 case BT_PERCNT:
1006 case BT_LSQB:
1007 return XML_TOK_LITERAL;
1008 default:
1009 return XML_TOK_INVALID;
1010 }
1011 default:
1012 ptr += MINBPC(enc);
1013 break;
1014 }
1015 }
1016 return XML_TOK_PARTIAL;
1017 }
1018
1019 static int PTRCALL
PREFIX(prologTok)1020 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1021 const char **nextTokPtr) {
1022 int tok;
1023 if (ptr >= end)
1024 return XML_TOK_NONE;
1025 if (MINBPC(enc) > 1) {
1026 size_t n = end - ptr;
1027 if (n & (MINBPC(enc) - 1)) {
1028 n &= ~(MINBPC(enc) - 1);
1029 if (n == 0)
1030 return XML_TOK_PARTIAL;
1031 end = ptr + n;
1032 }
1033 }
1034 switch (BYTE_TYPE(enc, ptr)) {
1035 case BT_QUOT:
1036 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1037 case BT_APOS:
1038 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1039 case BT_LT: {
1040 ptr += MINBPC(enc);
1041 REQUIRE_CHAR(enc, ptr, end);
1042 switch (BYTE_TYPE(enc, ptr)) {
1043 case BT_EXCL:
1044 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1045 case BT_QUEST:
1046 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1047 case BT_NMSTRT:
1048 case BT_HEX:
1049 case BT_NONASCII:
1050 case BT_LEAD2:
1051 case BT_LEAD3:
1052 case BT_LEAD4:
1053 *nextTokPtr = ptr - MINBPC(enc);
1054 return XML_TOK_INSTANCE_START;
1055 }
1056 *nextTokPtr = ptr;
1057 return XML_TOK_INVALID;
1058 }
1059 case BT_CR:
1060 if (ptr + MINBPC(enc) == end) {
1061 *nextTokPtr = end;
1062 /* indicate that this might be part of a CR/LF pair */
1063 return -XML_TOK_PROLOG_S;
1064 }
1065 /* fall through */
1066 case BT_S:
1067 case BT_LF:
1068 for (;;) {
1069 ptr += MINBPC(enc);
1070 if (! HAS_CHAR(enc, ptr, end))
1071 break;
1072 switch (BYTE_TYPE(enc, ptr)) {
1073 case BT_S:
1074 case BT_LF:
1075 break;
1076 case BT_CR:
1077 /* don't split CR/LF pair */
1078 if (ptr + MINBPC(enc) != end)
1079 break;
1080 /* fall through */
1081 default:
1082 *nextTokPtr = ptr;
1083 return XML_TOK_PROLOG_S;
1084 }
1085 }
1086 *nextTokPtr = ptr;
1087 return XML_TOK_PROLOG_S;
1088 case BT_PERCNT:
1089 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1090 case BT_COMMA:
1091 *nextTokPtr = ptr + MINBPC(enc);
1092 return XML_TOK_COMMA;
1093 case BT_LSQB:
1094 *nextTokPtr = ptr + MINBPC(enc);
1095 return XML_TOK_OPEN_BRACKET;
1096 case BT_RSQB:
1097 ptr += MINBPC(enc);
1098 if (! HAS_CHAR(enc, ptr, end))
1099 return -XML_TOK_CLOSE_BRACKET;
1100 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1101 REQUIRE_CHARS(enc, ptr, end, 2);
1102 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1103 *nextTokPtr = ptr + 2 * MINBPC(enc);
1104 return XML_TOK_COND_SECT_CLOSE;
1105 }
1106 }
1107 *nextTokPtr = ptr;
1108 return XML_TOK_CLOSE_BRACKET;
1109 case BT_LPAR:
1110 *nextTokPtr = ptr + MINBPC(enc);
1111 return XML_TOK_OPEN_PAREN;
1112 case BT_RPAR:
1113 ptr += MINBPC(enc);
1114 if (! HAS_CHAR(enc, ptr, end))
1115 return -XML_TOK_CLOSE_PAREN;
1116 switch (BYTE_TYPE(enc, ptr)) {
1117 case BT_AST:
1118 *nextTokPtr = ptr + MINBPC(enc);
1119 return XML_TOK_CLOSE_PAREN_ASTERISK;
1120 case BT_QUEST:
1121 *nextTokPtr = ptr + MINBPC(enc);
1122 return XML_TOK_CLOSE_PAREN_QUESTION;
1123 case BT_PLUS:
1124 *nextTokPtr = ptr + MINBPC(enc);
1125 return XML_TOK_CLOSE_PAREN_PLUS;
1126 case BT_CR:
1127 case BT_LF:
1128 case BT_S:
1129 case BT_GT:
1130 case BT_COMMA:
1131 case BT_VERBAR:
1132 case BT_RPAR:
1133 *nextTokPtr = ptr;
1134 return XML_TOK_CLOSE_PAREN;
1135 }
1136 *nextTokPtr = ptr;
1137 return XML_TOK_INVALID;
1138 case BT_VERBAR:
1139 *nextTokPtr = ptr + MINBPC(enc);
1140 return XML_TOK_OR;
1141 case BT_GT:
1142 *nextTokPtr = ptr + MINBPC(enc);
1143 return XML_TOK_DECL_CLOSE;
1144 case BT_NUM:
1145 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1146 # define LEAD_CASE(n) \
1147 case BT_LEAD##n: \
1148 if (end - ptr < n) \
1149 return XML_TOK_PARTIAL_CHAR; \
1150 if (IS_INVALID_CHAR(enc, ptr, n)) { \
1151 *nextTokPtr = ptr; \
1152 return XML_TOK_INVALID; \
1153 } \
1154 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1155 ptr += n; \
1156 tok = XML_TOK_NAME; \
1157 break; \
1158 } \
1159 if (IS_NAME_CHAR(enc, ptr, n)) { \
1160 ptr += n; \
1161 tok = XML_TOK_NMTOKEN; \
1162 break; \
1163 } \
1164 *nextTokPtr = ptr; \
1165 return XML_TOK_INVALID;
1166 LEAD_CASE(2)
1167 LEAD_CASE(3)
1168 LEAD_CASE(4)
1169 # undef LEAD_CASE
1170 case BT_NMSTRT:
1171 case BT_HEX:
1172 tok = XML_TOK_NAME;
1173 ptr += MINBPC(enc);
1174 break;
1175 case BT_DIGIT:
1176 case BT_NAME:
1177 case BT_MINUS:
1178 # ifdef XML_NS
1179 case BT_COLON:
1180 # endif
1181 tok = XML_TOK_NMTOKEN;
1182 ptr += MINBPC(enc);
1183 break;
1184 case BT_NONASCII:
1185 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1186 ptr += MINBPC(enc);
1187 tok = XML_TOK_NAME;
1188 break;
1189 }
1190 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1191 ptr += MINBPC(enc);
1192 tok = XML_TOK_NMTOKEN;
1193 break;
1194 }
1195 /* fall through */
1196 default:
1197 *nextTokPtr = ptr;
1198 return XML_TOK_INVALID;
1199 }
1200 while (HAS_CHAR(enc, ptr, end)) {
1201 switch (BYTE_TYPE(enc, ptr)) {
1202 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1203 case BT_GT:
1204 case BT_RPAR:
1205 case BT_COMMA:
1206 case BT_VERBAR:
1207 case BT_LSQB:
1208 case BT_PERCNT:
1209 case BT_S:
1210 case BT_CR:
1211 case BT_LF:
1212 *nextTokPtr = ptr;
1213 return tok;
1214 # ifdef XML_NS
1215 case BT_COLON:
1216 ptr += MINBPC(enc);
1217 switch (tok) {
1218 case XML_TOK_NAME:
1219 REQUIRE_CHAR(enc, ptr, end);
1220 tok = XML_TOK_PREFIXED_NAME;
1221 switch (BYTE_TYPE(enc, ptr)) {
1222 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1223 default:
1224 tok = XML_TOK_NMTOKEN;
1225 break;
1226 }
1227 break;
1228 case XML_TOK_PREFIXED_NAME:
1229 tok = XML_TOK_NMTOKEN;
1230 break;
1231 }
1232 break;
1233 # endif
1234 case BT_PLUS:
1235 if (tok == XML_TOK_NMTOKEN) {
1236 *nextTokPtr = ptr;
1237 return XML_TOK_INVALID;
1238 }
1239 *nextTokPtr = ptr + MINBPC(enc);
1240 return XML_TOK_NAME_PLUS;
1241 case BT_AST:
1242 if (tok == XML_TOK_NMTOKEN) {
1243 *nextTokPtr = ptr;
1244 return XML_TOK_INVALID;
1245 }
1246 *nextTokPtr = ptr + MINBPC(enc);
1247 return XML_TOK_NAME_ASTERISK;
1248 case BT_QUEST:
1249 if (tok == XML_TOK_NMTOKEN) {
1250 *nextTokPtr = ptr;
1251 return XML_TOK_INVALID;
1252 }
1253 *nextTokPtr = ptr + MINBPC(enc);
1254 return XML_TOK_NAME_QUESTION;
1255 default:
1256 *nextTokPtr = ptr;
1257 return XML_TOK_INVALID;
1258 }
1259 }
1260 return -tok;
1261 }
1262
1263 static int PTRCALL
PREFIX(attributeValueTok)1264 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1265 const char **nextTokPtr) {
1266 const char *start;
1267 if (ptr >= end)
1268 return XML_TOK_NONE;
1269 else if (! HAS_CHAR(enc, ptr, end)) {
1270 /* This line cannot be executed. The incoming data has already
1271 * been tokenized once, so incomplete characters like this have
1272 * already been eliminated from the input. Retaining the paranoia
1273 * check is still valuable, however.
1274 */
1275 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1276 }
1277 start = ptr;
1278 while (HAS_CHAR(enc, ptr, end)) {
1279 switch (BYTE_TYPE(enc, ptr)) {
1280 # define LEAD_CASE(n) \
1281 case BT_LEAD##n: \
1282 ptr += n; /* NOTE: The encoding has already been validated. */ \
1283 break;
1284 LEAD_CASE(2)
1285 LEAD_CASE(3)
1286 LEAD_CASE(4)
1287 # undef LEAD_CASE
1288 case BT_AMP:
1289 if (ptr == start)
1290 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1291 *nextTokPtr = ptr;
1292 return XML_TOK_DATA_CHARS;
1293 case BT_LT:
1294 /* this is for inside entity references */
1295 *nextTokPtr = ptr;
1296 return XML_TOK_INVALID;
1297 case BT_LF:
1298 if (ptr == start) {
1299 *nextTokPtr = ptr + MINBPC(enc);
1300 return XML_TOK_DATA_NEWLINE;
1301 }
1302 *nextTokPtr = ptr;
1303 return XML_TOK_DATA_CHARS;
1304 case BT_CR:
1305 if (ptr == start) {
1306 ptr += MINBPC(enc);
1307 if (! HAS_CHAR(enc, ptr, end))
1308 return XML_TOK_TRAILING_CR;
1309 if (BYTE_TYPE(enc, ptr) == BT_LF)
1310 ptr += MINBPC(enc);
1311 *nextTokPtr = ptr;
1312 return XML_TOK_DATA_NEWLINE;
1313 }
1314 *nextTokPtr = ptr;
1315 return XML_TOK_DATA_CHARS;
1316 case BT_S:
1317 if (ptr == start) {
1318 *nextTokPtr = ptr + MINBPC(enc);
1319 return XML_TOK_ATTRIBUTE_VALUE_S;
1320 }
1321 *nextTokPtr = ptr;
1322 return XML_TOK_DATA_CHARS;
1323 default:
1324 ptr += MINBPC(enc);
1325 break;
1326 }
1327 }
1328 *nextTokPtr = ptr;
1329 return XML_TOK_DATA_CHARS;
1330 }
1331
1332 static int PTRCALL
PREFIX(entityValueTok)1333 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1334 const char **nextTokPtr) {
1335 const char *start;
1336 if (ptr >= end)
1337 return XML_TOK_NONE;
1338 else if (! HAS_CHAR(enc, ptr, end)) {
1339 /* This line cannot be executed. The incoming data has already
1340 * been tokenized once, so incomplete characters like this have
1341 * already been eliminated from the input. Retaining the paranoia
1342 * check is still valuable, however.
1343 */
1344 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1345 }
1346 start = ptr;
1347 while (HAS_CHAR(enc, ptr, end)) {
1348 switch (BYTE_TYPE(enc, ptr)) {
1349 # define LEAD_CASE(n) \
1350 case BT_LEAD##n: \
1351 ptr += n; /* NOTE: The encoding has already been validated. */ \
1352 break;
1353 LEAD_CASE(2)
1354 LEAD_CASE(3)
1355 LEAD_CASE(4)
1356 # undef LEAD_CASE
1357 case BT_AMP:
1358 if (ptr == start)
1359 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1360 *nextTokPtr = ptr;
1361 return XML_TOK_DATA_CHARS;
1362 case BT_PERCNT:
1363 if (ptr == start) {
1364 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1365 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1366 }
1367 *nextTokPtr = ptr;
1368 return XML_TOK_DATA_CHARS;
1369 case BT_LF:
1370 if (ptr == start) {
1371 *nextTokPtr = ptr + MINBPC(enc);
1372 return XML_TOK_DATA_NEWLINE;
1373 }
1374 *nextTokPtr = ptr;
1375 return XML_TOK_DATA_CHARS;
1376 case BT_CR:
1377 if (ptr == start) {
1378 ptr += MINBPC(enc);
1379 if (! HAS_CHAR(enc, ptr, end))
1380 return XML_TOK_TRAILING_CR;
1381 if (BYTE_TYPE(enc, ptr) == BT_LF)
1382 ptr += MINBPC(enc);
1383 *nextTokPtr = ptr;
1384 return XML_TOK_DATA_NEWLINE;
1385 }
1386 *nextTokPtr = ptr;
1387 return XML_TOK_DATA_CHARS;
1388 default:
1389 ptr += MINBPC(enc);
1390 break;
1391 }
1392 }
1393 *nextTokPtr = ptr;
1394 return XML_TOK_DATA_CHARS;
1395 }
1396
1397 # ifdef XML_DTD
1398
1399 static int PTRCALL
PREFIX(ignoreSectionTok)1400 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1401 const char **nextTokPtr) {
1402 int level = 0;
1403 if (MINBPC(enc) > 1) {
1404 size_t n = end - ptr;
1405 if (n & (MINBPC(enc) - 1)) {
1406 n &= ~(MINBPC(enc) - 1);
1407 end = ptr + n;
1408 }
1409 }
1410 while (HAS_CHAR(enc, ptr, end)) {
1411 switch (BYTE_TYPE(enc, ptr)) {
1412 INVALID_CASES(ptr, nextTokPtr)
1413 case BT_LT:
1414 ptr += MINBPC(enc);
1415 REQUIRE_CHAR(enc, ptr, end);
1416 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1417 ptr += MINBPC(enc);
1418 REQUIRE_CHAR(enc, ptr, end);
1419 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1420 ++level;
1421 ptr += MINBPC(enc);
1422 }
1423 }
1424 break;
1425 case BT_RSQB:
1426 ptr += MINBPC(enc);
1427 REQUIRE_CHAR(enc, ptr, end);
1428 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1429 ptr += MINBPC(enc);
1430 REQUIRE_CHAR(enc, ptr, end);
1431 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1432 ptr += MINBPC(enc);
1433 if (level == 0) {
1434 *nextTokPtr = ptr;
1435 return XML_TOK_IGNORE_SECT;
1436 }
1437 --level;
1438 }
1439 }
1440 break;
1441 default:
1442 ptr += MINBPC(enc);
1443 break;
1444 }
1445 }
1446 return XML_TOK_PARTIAL;
1447 }
1448
1449 # endif /* XML_DTD */
1450
1451 static int PTRCALL
PREFIX(isPublicId)1452 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1453 const char **badPtr) {
1454 ptr += MINBPC(enc);
1455 end -= MINBPC(enc);
1456 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1457 switch (BYTE_TYPE(enc, ptr)) {
1458 case BT_DIGIT:
1459 case BT_HEX:
1460 case BT_MINUS:
1461 case BT_APOS:
1462 case BT_LPAR:
1463 case BT_RPAR:
1464 case BT_PLUS:
1465 case BT_COMMA:
1466 case BT_SOL:
1467 case BT_EQUALS:
1468 case BT_QUEST:
1469 case BT_CR:
1470 case BT_LF:
1471 case BT_SEMI:
1472 case BT_EXCL:
1473 case BT_AST:
1474 case BT_PERCNT:
1475 case BT_NUM:
1476 # ifdef XML_NS
1477 case BT_COLON:
1478 # endif
1479 break;
1480 case BT_S:
1481 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1482 *badPtr = ptr;
1483 return 0;
1484 }
1485 break;
1486 case BT_NAME:
1487 case BT_NMSTRT:
1488 if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1489 break;
1490 /* fall through */
1491 default:
1492 switch (BYTE_TO_ASCII(enc, ptr)) {
1493 case 0x24: /* $ */
1494 case 0x40: /* @ */
1495 break;
1496 default:
1497 *badPtr = ptr;
1498 return 0;
1499 }
1500 break;
1501 }
1502 }
1503 return 1;
1504 }
1505
1506 /* This must only be called for a well-formed start-tag or empty
1507 element tag. Returns the number of attributes. Pointers to the
1508 first attsMax attributes are stored in atts.
1509 */
1510
1511 static int PTRCALL
PREFIX(getAtts)1512 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1513 ATTRIBUTE *atts) {
1514 enum { other, inName, inValue } state = inName;
1515 int nAtts = 0;
1516 int open = 0; /* defined when state == inValue;
1517 initialization just to shut up compilers */
1518
1519 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1520 switch (BYTE_TYPE(enc, ptr)) {
1521 # define START_NAME \
1522 if (state == other) { \
1523 if (nAtts < attsMax) { \
1524 atts[nAtts].name = ptr; \
1525 atts[nAtts].normalized = 1; \
1526 } \
1527 state = inName; \
1528 }
1529 # define LEAD_CASE(n) \
1530 case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \
1531 START_NAME ptr += (n - MINBPC(enc)); \
1532 break;
1533 LEAD_CASE(2)
1534 LEAD_CASE(3)
1535 LEAD_CASE(4)
1536 # undef LEAD_CASE
1537 case BT_NONASCII:
1538 case BT_NMSTRT:
1539 case BT_HEX:
1540 START_NAME
1541 break;
1542 # undef START_NAME
1543 case BT_QUOT:
1544 if (state != inValue) {
1545 if (nAtts < attsMax)
1546 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1547 state = inValue;
1548 open = BT_QUOT;
1549 } else if (open == BT_QUOT) {
1550 state = other;
1551 if (nAtts < attsMax)
1552 atts[nAtts].valueEnd = ptr;
1553 nAtts++;
1554 }
1555 break;
1556 case BT_APOS:
1557 if (state != inValue) {
1558 if (nAtts < attsMax)
1559 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1560 state = inValue;
1561 open = BT_APOS;
1562 } else if (open == BT_APOS) {
1563 state = other;
1564 if (nAtts < attsMax)
1565 atts[nAtts].valueEnd = ptr;
1566 nAtts++;
1567 }
1568 break;
1569 case BT_AMP:
1570 if (nAtts < attsMax)
1571 atts[nAtts].normalized = 0;
1572 break;
1573 case BT_S:
1574 if (state == inName)
1575 state = other;
1576 else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1577 && (ptr == atts[nAtts].valuePtr
1578 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1579 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1580 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1581 atts[nAtts].normalized = 0;
1582 break;
1583 case BT_CR:
1584 case BT_LF:
1585 /* This case ensures that the first attribute name is counted
1586 Apart from that we could just change state on the quote. */
1587 if (state == inName)
1588 state = other;
1589 else if (state == inValue && nAtts < attsMax)
1590 atts[nAtts].normalized = 0;
1591 break;
1592 case BT_GT:
1593 case BT_SOL:
1594 if (state != inValue)
1595 return nAtts;
1596 break;
1597 default:
1598 break;
1599 }
1600 }
1601 /* not reached */
1602 }
1603
1604 static int PTRFASTCALL
PREFIX(charRefNumber)1605 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1606 int result = 0;
1607 /* skip &# */
1608 UNUSED_P(enc);
1609 ptr += 2 * MINBPC(enc);
1610 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1611 for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1612 ptr += MINBPC(enc)) {
1613 int c = BYTE_TO_ASCII(enc, ptr);
1614 switch (c) {
1615 case ASCII_0:
1616 case ASCII_1:
1617 case ASCII_2:
1618 case ASCII_3:
1619 case ASCII_4:
1620 case ASCII_5:
1621 case ASCII_6:
1622 case ASCII_7:
1623 case ASCII_8:
1624 case ASCII_9:
1625 result <<= 4;
1626 result |= (c - ASCII_0);
1627 break;
1628 case ASCII_A:
1629 case ASCII_B:
1630 case ASCII_C:
1631 case ASCII_D:
1632 case ASCII_E:
1633 case ASCII_F:
1634 result <<= 4;
1635 result += 10 + (c - ASCII_A);
1636 break;
1637 case ASCII_a:
1638 case ASCII_b:
1639 case ASCII_c:
1640 case ASCII_d:
1641 case ASCII_e:
1642 case ASCII_f:
1643 result <<= 4;
1644 result += 10 + (c - ASCII_a);
1645 break;
1646 }
1647 if (result >= 0x110000)
1648 return -1;
1649 }
1650 } else {
1651 for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1652 int c = BYTE_TO_ASCII(enc, ptr);
1653 result *= 10;
1654 result += (c - ASCII_0);
1655 if (result >= 0x110000)
1656 return -1;
1657 }
1658 }
1659 return checkCharRefNumber(result);
1660 }
1661
1662 static int PTRCALL
PREFIX(predefinedEntityName)1663 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1664 const char *end) {
1665 UNUSED_P(enc);
1666 switch ((end - ptr) / MINBPC(enc)) {
1667 case 2:
1668 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1669 switch (BYTE_TO_ASCII(enc, ptr)) {
1670 case ASCII_l:
1671 return ASCII_LT;
1672 case ASCII_g:
1673 return ASCII_GT;
1674 }
1675 }
1676 break;
1677 case 3:
1678 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1679 ptr += MINBPC(enc);
1680 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1681 ptr += MINBPC(enc);
1682 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1683 return ASCII_AMP;
1684 }
1685 }
1686 break;
1687 case 4:
1688 switch (BYTE_TO_ASCII(enc, ptr)) {
1689 case ASCII_q:
1690 ptr += MINBPC(enc);
1691 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1692 ptr += MINBPC(enc);
1693 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1694 ptr += MINBPC(enc);
1695 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1696 return ASCII_QUOT;
1697 }
1698 }
1699 break;
1700 case ASCII_a:
1701 ptr += MINBPC(enc);
1702 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1703 ptr += MINBPC(enc);
1704 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1705 ptr += MINBPC(enc);
1706 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1707 return ASCII_APOS;
1708 }
1709 }
1710 break;
1711 }
1712 }
1713 return 0;
1714 }
1715
1716 static int PTRCALL
PREFIX(nameMatchesAscii)1717 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1718 const char *end1, const char *ptr2) {
1719 UNUSED_P(enc);
1720 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1721 if (end1 - ptr1 < MINBPC(enc)) {
1722 /* This line cannot be executed. The incoming data has already
1723 * been tokenized once, so incomplete characters like this have
1724 * already been eliminated from the input. Retaining the
1725 * paranoia check is still valuable, however.
1726 */
1727 return 0; /* LCOV_EXCL_LINE */
1728 }
1729 if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1730 return 0;
1731 }
1732 return ptr1 == end1;
1733 }
1734
1735 static int PTRFASTCALL
PREFIX(nameLength)1736 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1737 const char *start = ptr;
1738 for (;;) {
1739 switch (BYTE_TYPE(enc, ptr)) {
1740 # define LEAD_CASE(n) \
1741 case BT_LEAD##n: \
1742 ptr += n; /* NOTE: The encoding has already been validated. */ \
1743 break;
1744 LEAD_CASE(2)
1745 LEAD_CASE(3)
1746 LEAD_CASE(4)
1747 # undef LEAD_CASE
1748 case BT_NONASCII:
1749 case BT_NMSTRT:
1750 # ifdef XML_NS
1751 case BT_COLON:
1752 # endif
1753 case BT_HEX:
1754 case BT_DIGIT:
1755 case BT_NAME:
1756 case BT_MINUS:
1757 ptr += MINBPC(enc);
1758 break;
1759 default:
1760 return (int)(ptr - start);
1761 }
1762 }
1763 }
1764
1765 static const char *PTRFASTCALL
PREFIX(skipS)1766 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1767 for (;;) {
1768 switch (BYTE_TYPE(enc, ptr)) {
1769 case BT_LF:
1770 case BT_CR:
1771 case BT_S:
1772 ptr += MINBPC(enc);
1773 break;
1774 default:
1775 return ptr;
1776 }
1777 }
1778 }
1779
1780 static void PTRCALL
PREFIX(updatePosition)1781 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1782 POSITION *pos) {
1783 while (HAS_CHAR(enc, ptr, end)) {
1784 switch (BYTE_TYPE(enc, ptr)) {
1785 # define LEAD_CASE(n) \
1786 case BT_LEAD##n: \
1787 ptr += n; /* NOTE: The encoding has already been validated. */ \
1788 pos->columnNumber++; \
1789 break;
1790 LEAD_CASE(2)
1791 LEAD_CASE(3)
1792 LEAD_CASE(4)
1793 # undef LEAD_CASE
1794 case BT_LF:
1795 pos->columnNumber = 0;
1796 pos->lineNumber++;
1797 ptr += MINBPC(enc);
1798 break;
1799 case BT_CR:
1800 pos->lineNumber++;
1801 ptr += MINBPC(enc);
1802 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1803 ptr += MINBPC(enc);
1804 pos->columnNumber = 0;
1805 break;
1806 default:
1807 ptr += MINBPC(enc);
1808 pos->columnNumber++;
1809 break;
1810 }
1811 }
1812 }
1813
1814 # undef DO_LEAD_CASE
1815 # undef MULTIBYTE_CASES
1816 # undef INVALID_CASES
1817 # undef CHECK_NAME_CASE
1818 # undef CHECK_NAME_CASES
1819 # undef CHECK_NMSTRT_CASE
1820 # undef CHECK_NMSTRT_CASES
1821
1822 #endif /* XML_TOK_IMPL_C */
1823
1824 #endif /* LV_USE_XML */
1825
1826