1 /* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2002      Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
13    Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
14    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
15    Copyright (c) 2018      Benjamin Peterson <benjamin@python.org>
16    Copyright (c) 2018      Anton Maklakov <antmak.pub@gmail.com>
17    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
18    Copyright (c) 2020      Boris Kolpackov <boris@codesynthesis.com>
19    Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
20    Licensed under the MIT license:
21 
22    Permission is  hereby granted,  free of charge,  to any  person obtaining
23    a  copy  of  this  software   and  associated  documentation  files  (the
24    "Software"),  to  deal in  the  Software  without restriction,  including
25    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
26    distribute, sublicense, and/or sell copies of the Software, and to permit
27    persons  to whom  the Software  is  furnished to  do so,  subject to  the
28    following conditions:
29 
30    The above copyright  notice and this permission notice  shall be included
31    in all copies or substantial portions of the Software.
32 
33    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
34    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
35    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
36    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
37    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
38    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
39    USE OR OTHER DEALINGS IN THE SOFTWARE.
40 */
41 
42 #include "../../lv_conf_internal.h"
43 #if LV_USE_XML
44 
45 #ifdef XML_TOK_IMPL_C
46 
/* Fallback used when the including file has not supplied IS_INVALID_CHAR:
   the check expands to the constant 0, so the invalid-character tests in the
   macros below never fire. */
#  ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif
50 
/* INVALID_LEAD_CASE(n, ptr, nextTokPtr)
   Expands to the `case BT_LEAD<n>:` arm of a switch over a byte type.
   - fewer than n bytes left before `end`  -> return XML_TOK_PARTIAL_CHAR
   - IS_INVALID_CHAR reports the sequence  -> store ptr in *nextTokPtr and
                                              return XML_TOK_INVALID
   - otherwise                             -> advance ptr by n and leave the
                                              switch
   Note: `enc` and `end` are not parameters; they are picked up from the scope
   in which the macro is expanded.  `n` must be a literal digit because it is
   pasted onto BT_LEAD.  All arguments are parenthesized in the expansion,
   matching CHECK_NMSTRT_CASE. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if ((end) - (ptr) < (n))                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;

/* INVALID_CASES(ptr, nextTokPtr)
   Expands to the switch arms that reject bad input: the three multi-byte
   lead cases above, plus BT_NONXML / BT_MALFORM / BT_TRAIL, which always
   store ptr in *nextTokPtr and return XML_TOK_INVALID. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                      \
  case BT_NONXML:                                                              \
  case BT_MALFORM:                                                             \
  case BT_TRAIL:                                                               \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
71 
/* CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)
   Expands to the `case BT_LEAD<n>:` arm for an n-byte sequence inside a name.
   - fewer than n bytes before end            -> return XML_TOK_PARTIAL_CHAR
   - invalid sequence, or not a name char     -> *nextTokPtr = ptr and
                                                 return XML_TOK_INVALID
   - otherwise                                -> advance ptr by n, leave switch
   `n` must be a literal digit (it is pasted onto BT_LEAD).  Every argument is
   parenthesized in the expansion so that expression arguments (for example a
   conditional expression for nextTokPtr) are parsed as intended. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if ((end) - (ptr) < (n))                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;

/* CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   Expands to every switch arm that accepts one name character:
   - BT_NONASCII is accepted only if IS_NAME_CHAR_MINBPC says so, otherwise
     XML_TOK_INVALID is returned with *nextTokPtr = ptr;
   - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS advance ptr by
     MINBPC(enc);
   - the 2-, 3- and 4-byte lead cases are handled by CHECK_NAME_CASE. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
  case BT_DIGIT:                                                               \
  case BT_NAME:                                                                \
  case BT_MINUS:                                                               \
    (ptr) += MINBPC(enc);                                                      \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
100 
/* CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)
   Expands to the `case BT_LEAD<n>:` arm for an n-byte sequence at the start
   of a name.
   - fewer than n bytes before end              -> return XML_TOK_PARTIAL_CHAR
   - invalid sequence, or not a name-start char -> *nextTokPtr = ptr and
                                                   return XML_TOK_INVALID
   - otherwise                                  -> advance ptr by n, leave
                                                   the switch
   `n` must be a literal digit (it is pasted onto BT_LEAD).  Every argument is
   parenthesized in the expansion. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if ((end) - (ptr) < (n))                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;

/* CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
   Expands to every switch arm that accepts one name-start character:
   - BT_NONASCII is accepted only if IS_NMSTRT_CHAR_MINBPC says so, otherwise
     XML_TOK_INVALID is returned with *nextTokPtr = ptr;
   - BT_NMSTRT and BT_HEX advance ptr by MINBPC(enc);
   - the 2-, 3- and 4-byte lead cases are handled by CHECK_NMSTRT_CASE. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
    (ptr) += MINBPC(enc);                                                      \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
126 
/* Default PREFIX leaves a function name unchanged.  NOTE(review): the
   including file presumably predefines PREFIX so that each inclusion of this
   file produces distinctly named functions -- confirm against xmltok.c. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif
130 
/* True when at least `count` characters, each MINBPC(enc) bytes wide, lie
   between ptr (inclusive) and end (exclusive). */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one character is available at ptr. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Returns XML_TOK_PARTIAL from the enclosing function unless `count`
   characters are available.  Wrapped in do { } while (0) so the macro behaves
   as a single statement: it must be followed by a semicolon and is safe as
   the body of an unbraced if/else. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    do {                                                                       \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    } while (0)

/* Single-character form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
144 
145 /* ptr points to character following "<!-" */
146 
147 static int PTRCALL
PREFIX(scanComment)148 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
149                     const char **nextTokPtr) {
150   if (HAS_CHAR(enc, ptr, end)) {
151     if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
152       *nextTokPtr = ptr;
153       return XML_TOK_INVALID;
154     }
155     ptr += MINBPC(enc);
156     while (HAS_CHAR(enc, ptr, end)) {
157       switch (BYTE_TYPE(enc, ptr)) {
158         INVALID_CASES(ptr, nextTokPtr)
159       case BT_MINUS:
160         ptr += MINBPC(enc);
161         REQUIRE_CHAR(enc, ptr, end);
162         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
163           ptr += MINBPC(enc);
164           REQUIRE_CHAR(enc, ptr, end);
165           if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
166             *nextTokPtr = ptr;
167             return XML_TOK_INVALID;
168           }
169           *nextTokPtr = ptr + MINBPC(enc);
170           return XML_TOK_COMMENT;
171         }
172         break;
173       default:
174         ptr += MINBPC(enc);
175         break;
176       }
177     }
178   }
179   return XML_TOK_PARTIAL;
180 }
181 
182 /* ptr points to character following "<!" */
183 
184 static int PTRCALL
PREFIX(scanDecl)185 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
186                  const char **nextTokPtr) {
187   REQUIRE_CHAR(enc, ptr, end);
188   switch (BYTE_TYPE(enc, ptr)) {
189   case BT_MINUS:
190     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
191   case BT_LSQB:
192     *nextTokPtr = ptr + MINBPC(enc);
193     return XML_TOK_COND_SECT_OPEN;
194   case BT_NMSTRT:
195   case BT_HEX:
196     ptr += MINBPC(enc);
197     break;
198   default:
199     *nextTokPtr = ptr;
200     return XML_TOK_INVALID;
201   }
202   while (HAS_CHAR(enc, ptr, end)) {
203     switch (BYTE_TYPE(enc, ptr)) {
204     case BT_PERCNT:
205       REQUIRE_CHARS(enc, ptr, end, 2);
206       /* don't allow <!ENTITY% foo "whatever"> */
207       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
208       case BT_S:
209       case BT_CR:
210       case BT_LF:
211       case BT_PERCNT:
212         *nextTokPtr = ptr;
213         return XML_TOK_INVALID;
214       }
215       /* fall through */
216     case BT_S:
217     case BT_CR:
218     case BT_LF:
219       *nextTokPtr = ptr;
220       return XML_TOK_DECL_OPEN;
221     case BT_NMSTRT:
222     case BT_HEX:
223       ptr += MINBPC(enc);
224       break;
225     default:
226       *nextTokPtr = ptr;
227       return XML_TOK_INVALID;
228     }
229   }
230   return XML_TOK_PARTIAL;
231 }
232 
233 static int PTRCALL
PREFIX(checkPiTarget)234 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
235                       int *tokPtr) {
236   int upper = 0;
237   UNUSED_P(enc);
238   *tokPtr = XML_TOK_PI;
239   if (end - ptr != MINBPC(enc) * 3)
240     return 1;
241   switch (BYTE_TO_ASCII(enc, ptr)) {
242   case ASCII_x:
243     break;
244   case ASCII_X:
245     upper = 1;
246     break;
247   default:
248     return 1;
249   }
250   ptr += MINBPC(enc);
251   switch (BYTE_TO_ASCII(enc, ptr)) {
252   case ASCII_m:
253     break;
254   case ASCII_M:
255     upper = 1;
256     break;
257   default:
258     return 1;
259   }
260   ptr += MINBPC(enc);
261   switch (BYTE_TO_ASCII(enc, ptr)) {
262   case ASCII_l:
263     break;
264   case ASCII_L:
265     upper = 1;
266     break;
267   default:
268     return 1;
269   }
270   if (upper)
271     return 0;
272   *tokPtr = XML_TOK_XML_DECL;
273   return 1;
274 }
275 
276 /* ptr points to character following "<?" */
277 
278 static int PTRCALL
PREFIX(scanPi)279 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
280                const char **nextTokPtr) {
281   int tok;
282   const char *target = ptr;
283   REQUIRE_CHAR(enc, ptr, end);
284   switch (BYTE_TYPE(enc, ptr)) {
285     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
286   default:
287     *nextTokPtr = ptr;
288     return XML_TOK_INVALID;
289   }
290   while (HAS_CHAR(enc, ptr, end)) {
291     switch (BYTE_TYPE(enc, ptr)) {
292       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
293     case BT_S:
294     case BT_CR:
295     case BT_LF:
296       if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
297         *nextTokPtr = ptr;
298         return XML_TOK_INVALID;
299       }
300       ptr += MINBPC(enc);
301       while (HAS_CHAR(enc, ptr, end)) {
302         switch (BYTE_TYPE(enc, ptr)) {
303           INVALID_CASES(ptr, nextTokPtr)
304         case BT_QUEST:
305           ptr += MINBPC(enc);
306           REQUIRE_CHAR(enc, ptr, end);
307           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
308             *nextTokPtr = ptr + MINBPC(enc);
309             return tok;
310           }
311           break;
312         default:
313           ptr += MINBPC(enc);
314           break;
315         }
316       }
317       return XML_TOK_PARTIAL;
318     case BT_QUEST:
319       if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
320         *nextTokPtr = ptr;
321         return XML_TOK_INVALID;
322       }
323       ptr += MINBPC(enc);
324       REQUIRE_CHAR(enc, ptr, end);
325       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
326         *nextTokPtr = ptr + MINBPC(enc);
327         return tok;
328       }
329       /* fall through */
330     default:
331       *nextTokPtr = ptr;
332       return XML_TOK_INVALID;
333     }
334   }
335   return XML_TOK_PARTIAL;
336 }
337 
338 static int PTRCALL
PREFIX(scanCdataSection)339 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
340                          const char **nextTokPtr) {
341   static const char CDATA_LSQB[]
342       = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
343   int i;
344   UNUSED_P(enc);
345   /* CDATA[ */
346   REQUIRE_CHARS(enc, ptr, end, 6);
347   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
348     if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
349       *nextTokPtr = ptr;
350       return XML_TOK_INVALID;
351     }
352   }
353   *nextTokPtr = ptr;
354   return XML_TOK_CDATA_SECT_OPEN;
355 }
356 
357 static int PTRCALL
PREFIX(cdataSectionTok)358 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
359                         const char **nextTokPtr) {
360   if (ptr >= end)
361     return XML_TOK_NONE;
362   if (MINBPC(enc) > 1) {
363     size_t n = end - ptr;
364     if (n & (MINBPC(enc) - 1)) {
365       n &= ~(MINBPC(enc) - 1);
366       if (n == 0)
367         return XML_TOK_PARTIAL;
368       end = ptr + n;
369     }
370   }
371   switch (BYTE_TYPE(enc, ptr)) {
372   case BT_RSQB:
373     ptr += MINBPC(enc);
374     REQUIRE_CHAR(enc, ptr, end);
375     if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
376       break;
377     ptr += MINBPC(enc);
378     REQUIRE_CHAR(enc, ptr, end);
379     if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
380       ptr -= MINBPC(enc);
381       break;
382     }
383     *nextTokPtr = ptr + MINBPC(enc);
384     return XML_TOK_CDATA_SECT_CLOSE;
385   case BT_CR:
386     ptr += MINBPC(enc);
387     REQUIRE_CHAR(enc, ptr, end);
388     if (BYTE_TYPE(enc, ptr) == BT_LF)
389       ptr += MINBPC(enc);
390     *nextTokPtr = ptr;
391     return XML_TOK_DATA_NEWLINE;
392   case BT_LF:
393     *nextTokPtr = ptr + MINBPC(enc);
394     return XML_TOK_DATA_NEWLINE;
395     INVALID_CASES(ptr, nextTokPtr)
396   default:
397     ptr += MINBPC(enc);
398     break;
399   }
400   while (HAS_CHAR(enc, ptr, end)) {
401     switch (BYTE_TYPE(enc, ptr)) {
402 #  define LEAD_CASE(n)                                                         \
403   case BT_LEAD##n:                                                             \
404     if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
405       *nextTokPtr = ptr;                                                       \
406       return XML_TOK_DATA_CHARS;                                               \
407     }                                                                          \
408     ptr += n;                                                                  \
409     break;
410       LEAD_CASE(2)
411       LEAD_CASE(3)
412       LEAD_CASE(4)
413 #  undef LEAD_CASE
414     case BT_NONXML:
415     case BT_MALFORM:
416     case BT_TRAIL:
417     case BT_CR:
418     case BT_LF:
419     case BT_RSQB:
420       *nextTokPtr = ptr;
421       return XML_TOK_DATA_CHARS;
422     default:
423       ptr += MINBPC(enc);
424       break;
425     }
426   }
427   *nextTokPtr = ptr;
428   return XML_TOK_DATA_CHARS;
429 }
430 
431 /* ptr points to character following "</" */
432 
433 static int PTRCALL
PREFIX(scanEndTag)434 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
435                    const char **nextTokPtr) {
436   REQUIRE_CHAR(enc, ptr, end);
437   switch (BYTE_TYPE(enc, ptr)) {
438     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
439   default:
440     *nextTokPtr = ptr;
441     return XML_TOK_INVALID;
442   }
443   while (HAS_CHAR(enc, ptr, end)) {
444     switch (BYTE_TYPE(enc, ptr)) {
445       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
446     case BT_S:
447     case BT_CR:
448     case BT_LF:
449       for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
450         switch (BYTE_TYPE(enc, ptr)) {
451         case BT_S:
452         case BT_CR:
453         case BT_LF:
454           break;
455         case BT_GT:
456           *nextTokPtr = ptr + MINBPC(enc);
457           return XML_TOK_END_TAG;
458         default:
459           *nextTokPtr = ptr;
460           return XML_TOK_INVALID;
461         }
462       }
463       return XML_TOK_PARTIAL;
464 #  ifdef XML_NS
465     case BT_COLON:
466       /* no need to check qname syntax here,
467          since end-tag must match exactly */
468       ptr += MINBPC(enc);
469       break;
470 #  endif
471     case BT_GT:
472       *nextTokPtr = ptr + MINBPC(enc);
473       return XML_TOK_END_TAG;
474     default:
475       *nextTokPtr = ptr;
476       return XML_TOK_INVALID;
477     }
478   }
479   return XML_TOK_PARTIAL;
480 }
481 
482 /* ptr points to character following "&#X" */
483 
484 static int PTRCALL
PREFIX(scanHexCharRef)485 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
486                        const char **nextTokPtr) {
487   if (HAS_CHAR(enc, ptr, end)) {
488     switch (BYTE_TYPE(enc, ptr)) {
489     case BT_DIGIT:
490     case BT_HEX:
491       break;
492     default:
493       *nextTokPtr = ptr;
494       return XML_TOK_INVALID;
495     }
496     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
497       switch (BYTE_TYPE(enc, ptr)) {
498       case BT_DIGIT:
499       case BT_HEX:
500         break;
501       case BT_SEMI:
502         *nextTokPtr = ptr + MINBPC(enc);
503         return XML_TOK_CHAR_REF;
504       default:
505         *nextTokPtr = ptr;
506         return XML_TOK_INVALID;
507       }
508     }
509   }
510   return XML_TOK_PARTIAL;
511 }
512 
513 /* ptr points to character following "&#" */
514 
515 static int PTRCALL
PREFIX(scanCharRef)516 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
517                     const char **nextTokPtr) {
518   if (HAS_CHAR(enc, ptr, end)) {
519     if (CHAR_MATCHES(enc, ptr, ASCII_x))
520       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
521     switch (BYTE_TYPE(enc, ptr)) {
522     case BT_DIGIT:
523       break;
524     default:
525       *nextTokPtr = ptr;
526       return XML_TOK_INVALID;
527     }
528     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
529       switch (BYTE_TYPE(enc, ptr)) {
530       case BT_DIGIT:
531         break;
532       case BT_SEMI:
533         *nextTokPtr = ptr + MINBPC(enc);
534         return XML_TOK_CHAR_REF;
535       default:
536         *nextTokPtr = ptr;
537         return XML_TOK_INVALID;
538       }
539     }
540   }
541   return XML_TOK_PARTIAL;
542 }
543 
544 /* ptr points to character following "&" */
545 
546 static int PTRCALL
PREFIX(scanRef)547 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
548                 const char **nextTokPtr) {
549   REQUIRE_CHAR(enc, ptr, end);
550   switch (BYTE_TYPE(enc, ptr)) {
551     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
552   case BT_NUM:
553     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
554   default:
555     *nextTokPtr = ptr;
556     return XML_TOK_INVALID;
557   }
558   while (HAS_CHAR(enc, ptr, end)) {
559     switch (BYTE_TYPE(enc, ptr)) {
560       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
561     case BT_SEMI:
562       *nextTokPtr = ptr + MINBPC(enc);
563       return XML_TOK_ENTITY_REF;
564     default:
565       *nextTokPtr = ptr;
566       return XML_TOK_INVALID;
567     }
568   }
569   return XML_TOK_PARTIAL;
570 }
571 
/* Scans the attribute list of a start tag through to the end of the tag.
   On entry ptr points to the character following the first character of the
   first attribute name.

   Returns
     XML_TOK_START_TAG_WITH_ATTS      with *nextTokPtr just past '>';
     XML_TOK_EMPTY_ELEMENT_WITH_ATTS  with *nextTokPtr just past "/>";
     XML_TOK_INVALID                  with *nextTokPtr at the offending
                                      character;
     XML_TOK_PARTIAL                  when the input ends first;
     XML_TOK_PARTIAL_CHAR             when a multi-byte sequence is cut off by
                                      `end`;
     any non-positive result of scanRef for a reference inside a value. */

static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr) {
#  ifdef XML_NS
  /* set once a ':' has been seen in the current attribute name */
  int hadColon = 0;
#  endif
  /* Each pass of this loop consumes one more character of an attribute name,
     or - once the name ends - the whole "= value" part that follows it. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#  ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a name-start
         character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#  endif
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* whitespace after the name: skip it; only '=' may follow */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_EQUALS: {
      /* byte type of the opening quote (BT_QUOT or BT_APOS); the value ends
         at the next character of the same type */
      int open;
#  ifdef XML_NS
      hadColon = 0;
#  endif
      /* skip whitespace between '=' and the opening quote */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        open = BYTE_TYPE(enc, ptr);
        if (open == BT_QUOT || open == BT_APOS)
          break;
        switch (open) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      ptr += MINBPC(enc);
      /* in attribute value */
      for (;;) {
        int t;
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == open)
          break;
        switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
        case BT_AMP: {
          /* scanRef stores its "next" position straight into ptr, so on
             success ptr has already been moved past the reference */
          int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
          if (tok <= 0) {
            /* propagate the failure; for XML_TOK_INVALID report the position
               scanRef stored in ptr */
            if (tok == XML_TOK_INVALID)
              *nextTokPtr = ptr;
            return tok;
          }
          break;
        }
        case BT_LT:
          /* '<' is rejected inside an attribute value */
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      /* step past the closing quote and look at what follows it */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S:
      case BT_CR:
      case BT_LF:
        break;
      case BT_SOL:
        goto sol;
      case BT_GT:
        goto gt;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      /* ptr points to the whitespace character that followed the closing
         quote; skip further whitespace until the next attribute name or the
         end of the tag */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_S:
        case BT_CR:
        case BT_LF:
          continue;
        case BT_GT:
        gt: /* also reached by goto when '>' directly follows the quote */
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_START_TAG_WITH_ATTS;
        case BT_SOL:
        sol: /* also reached by goto when '/' directly follows the quote */
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* reached only after a name-start character was consumed: leave the
           whitespace loop and resume scanning the new attribute name */
        break;
      }
      break;
    }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
724 
/* Scans markup that starts with '<'.  On entry ptr points to the character
   following "<".

   Dispatches on the next character:
     '!'  followed by '-' -> scanComment, followed by '[' -> scanCdataSection,
          anything else is XML_TOK_INVALID;
     '?'  -> scanPi;
     '/'  -> scanEndTag;
     a name-start character begins a start tag, which is scanned here.

   For a start tag, returns
     XML_TOK_START_TAG_NO_ATTS      with *nextTokPtr just past '>';
     XML_TOK_EMPTY_ELEMENT_NO_ATTS  with *nextTokPtr just past "/>";
     the result of scanAtts         when an attribute name follows the
                                    element name;
     XML_TOK_INVALID                with *nextTokPtr at the offending
                                    character;
     XML_TOK_PARTIAL                when the input ends first;
     XML_TOK_PARTIAL_CHAR           when a multi-byte sequence is cut off by
                                    `end`. */

static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr) {
#  ifdef XML_NS
  /* set once a ':' has been seen in the element name */
  int hadColon;
#  endif
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    }
    /* "<!" followed by anything else is rejected here */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#  ifdef XML_NS
  hadColon = 0;
#  endif
  /* we have a start-tag */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#  ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a name-start
         character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#  endif
    case BT_S:
    case BT_CR:
    case BT_LF: {
      /* whitespace ends the element name: skip it, then expect '>', "/>" or
         the first character of an attribute name */
      ptr += MINBPC(enc);
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_GT:
          goto gt;
        case BT_SOL:
          goto sol;
        case BT_S:
        case BT_CR:
        case BT_LF:
          ptr += MINBPC(enc);
          continue;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* reached only after a name-start character was consumed, so ptr is
           now just past the first character of an attribute name */
        return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
      }
      return XML_TOK_PARTIAL;
    }
    case BT_GT:
    gt: /* also reached by goto from the whitespace loop above */
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol: /* also reached by goto from the whitespace loop above */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
824 
825 static int PTRCALL
PREFIX(contentTok)826 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
827                    const char **nextTokPtr) {
828   if (ptr >= end)
829     return XML_TOK_NONE;
830   if (MINBPC(enc) > 1) {
831     size_t n = end - ptr;
832     if (n & (MINBPC(enc) - 1)) {
833       n &= ~(MINBPC(enc) - 1);
834       if (n == 0)
835         return XML_TOK_PARTIAL;
836       end = ptr + n;
837     }
838   }
839   switch (BYTE_TYPE(enc, ptr)) {
840   case BT_LT:
841     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
842   case BT_AMP:
843     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
844   case BT_CR:
845     ptr += MINBPC(enc);
846     if (! HAS_CHAR(enc, ptr, end))
847       return XML_TOK_TRAILING_CR;
848     if (BYTE_TYPE(enc, ptr) == BT_LF)
849       ptr += MINBPC(enc);
850     *nextTokPtr = ptr;
851     return XML_TOK_DATA_NEWLINE;
852   case BT_LF:
853     *nextTokPtr = ptr + MINBPC(enc);
854     return XML_TOK_DATA_NEWLINE;
855   case BT_RSQB:
856     ptr += MINBPC(enc);
857     if (! HAS_CHAR(enc, ptr, end))
858       return XML_TOK_TRAILING_RSQB;
859     if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
860       break;
861     ptr += MINBPC(enc);
862     if (! HAS_CHAR(enc, ptr, end))
863       return XML_TOK_TRAILING_RSQB;
864     if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
865       ptr -= MINBPC(enc);
866       break;
867     }
868     *nextTokPtr = ptr;
869     return XML_TOK_INVALID;
870     INVALID_CASES(ptr, nextTokPtr)
871   default:
872     ptr += MINBPC(enc);
873     break;
874   }
875   while (HAS_CHAR(enc, ptr, end)) {
876     switch (BYTE_TYPE(enc, ptr)) {
877 #  define LEAD_CASE(n)                                                         \
878   case BT_LEAD##n:                                                             \
879     if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
880       *nextTokPtr = ptr;                                                       \
881       return XML_TOK_DATA_CHARS;                                               \
882     }                                                                          \
883     ptr += n;                                                                  \
884     break;
885       LEAD_CASE(2)
886       LEAD_CASE(3)
887       LEAD_CASE(4)
888 #  undef LEAD_CASE
889     case BT_RSQB:
890       if (HAS_CHARS(enc, ptr, end, 2)) {
891         if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
892           ptr += MINBPC(enc);
893           break;
894         }
895         if (HAS_CHARS(enc, ptr, end, 3)) {
896           if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
897             ptr += MINBPC(enc);
898             break;
899           }
900           *nextTokPtr = ptr + 2 * MINBPC(enc);
901           return XML_TOK_INVALID;
902         }
903       }
904       /* fall through */
905     case BT_AMP:
906     case BT_LT:
907     case BT_NONXML:
908     case BT_MALFORM:
909     case BT_TRAIL:
910     case BT_CR:
911     case BT_LF:
912       *nextTokPtr = ptr;
913       return XML_TOK_DATA_CHARS;
914     default:
915       ptr += MINBPC(enc);
916       break;
917     }
918   }
919   *nextTokPtr = ptr;
920   return XML_TOK_DATA_CHARS;
921 }
922 
923 /* ptr points to character following "%" */
924 
925 static int PTRCALL
PREFIX(scanPercent)926 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
927                     const char **nextTokPtr) {
928   REQUIRE_CHAR(enc, ptr, end);
929   switch (BYTE_TYPE(enc, ptr)) {
930     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
931   case BT_S:
932   case BT_LF:
933   case BT_CR:
934   case BT_PERCNT:
935     *nextTokPtr = ptr;
936     return XML_TOK_PERCENT;
937   default:
938     *nextTokPtr = ptr;
939     return XML_TOK_INVALID;
940   }
941   while (HAS_CHAR(enc, ptr, end)) {
942     switch (BYTE_TYPE(enc, ptr)) {
943       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
944     case BT_SEMI:
945       *nextTokPtr = ptr + MINBPC(enc);
946       return XML_TOK_PARAM_ENTITY_REF;
947     default:
948       *nextTokPtr = ptr;
949       return XML_TOK_INVALID;
950     }
951   }
952   return XML_TOK_PARTIAL;
953 }
954 
955 static int PTRCALL
PREFIX(scanPoundName)956 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
957                       const char **nextTokPtr) {
958   REQUIRE_CHAR(enc, ptr, end);
959   switch (BYTE_TYPE(enc, ptr)) {
960     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
961   default:
962     *nextTokPtr = ptr;
963     return XML_TOK_INVALID;
964   }
965   while (HAS_CHAR(enc, ptr, end)) {
966     switch (BYTE_TYPE(enc, ptr)) {
967       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
968     case BT_CR:
969     case BT_LF:
970     case BT_S:
971     case BT_RPAR:
972     case BT_GT:
973     case BT_PERCNT:
974     case BT_VERBAR:
975       *nextTokPtr = ptr;
976       return XML_TOK_POUND_NAME;
977     default:
978       *nextTokPtr = ptr;
979       return XML_TOK_INVALID;
980     }
981   }
982   return -XML_TOK_POUND_NAME;
983 }
984 
985 static int PTRCALL
PREFIX(scanLit)986 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
987                 const char **nextTokPtr) {
988   while (HAS_CHAR(enc, ptr, end)) {
989     int t = BYTE_TYPE(enc, ptr);
990     switch (t) {
991       INVALID_CASES(ptr, nextTokPtr)
992     case BT_QUOT:
993     case BT_APOS:
994       ptr += MINBPC(enc);
995       if (t != open)
996         break;
997       if (! HAS_CHAR(enc, ptr, end))
998         return -XML_TOK_LITERAL;
999       *nextTokPtr = ptr;
1000       switch (BYTE_TYPE(enc, ptr)) {
1001       case BT_S:
1002       case BT_CR:
1003       case BT_LF:
1004       case BT_GT:
1005       case BT_PERCNT:
1006       case BT_LSQB:
1007         return XML_TOK_LITERAL;
1008       default:
1009         return XML_TOK_INVALID;
1010       }
1011     default:
1012       ptr += MINBPC(enc);
1013       break;
1014     }
1015   }
1016   return XML_TOK_PARTIAL;
1017 }
1018 
1019 static int PTRCALL
PREFIX(prologTok)1020 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1021                   const char **nextTokPtr) {
1022   int tok;
1023   if (ptr >= end)
1024     return XML_TOK_NONE;
1025   if (MINBPC(enc) > 1) {
1026     size_t n = end - ptr;
1027     if (n & (MINBPC(enc) - 1)) {
1028       n &= ~(MINBPC(enc) - 1);
1029       if (n == 0)
1030         return XML_TOK_PARTIAL;
1031       end = ptr + n;
1032     }
1033   }
1034   switch (BYTE_TYPE(enc, ptr)) {
1035   case BT_QUOT:
1036     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1037   case BT_APOS:
1038     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1039   case BT_LT: {
1040     ptr += MINBPC(enc);
1041     REQUIRE_CHAR(enc, ptr, end);
1042     switch (BYTE_TYPE(enc, ptr)) {
1043     case BT_EXCL:
1044       return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1045     case BT_QUEST:
1046       return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1047     case BT_NMSTRT:
1048     case BT_HEX:
1049     case BT_NONASCII:
1050     case BT_LEAD2:
1051     case BT_LEAD3:
1052     case BT_LEAD4:
1053       *nextTokPtr = ptr - MINBPC(enc);
1054       return XML_TOK_INSTANCE_START;
1055     }
1056     *nextTokPtr = ptr;
1057     return XML_TOK_INVALID;
1058   }
1059   case BT_CR:
1060     if (ptr + MINBPC(enc) == end) {
1061       *nextTokPtr = end;
1062       /* indicate that this might be part of a CR/LF pair */
1063       return -XML_TOK_PROLOG_S;
1064     }
1065     /* fall through */
1066   case BT_S:
1067   case BT_LF:
1068     for (;;) {
1069       ptr += MINBPC(enc);
1070       if (! HAS_CHAR(enc, ptr, end))
1071         break;
1072       switch (BYTE_TYPE(enc, ptr)) {
1073       case BT_S:
1074       case BT_LF:
1075         break;
1076       case BT_CR:
1077         /* don't split CR/LF pair */
1078         if (ptr + MINBPC(enc) != end)
1079           break;
1080         /* fall through */
1081       default:
1082         *nextTokPtr = ptr;
1083         return XML_TOK_PROLOG_S;
1084       }
1085     }
1086     *nextTokPtr = ptr;
1087     return XML_TOK_PROLOG_S;
1088   case BT_PERCNT:
1089     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1090   case BT_COMMA:
1091     *nextTokPtr = ptr + MINBPC(enc);
1092     return XML_TOK_COMMA;
1093   case BT_LSQB:
1094     *nextTokPtr = ptr + MINBPC(enc);
1095     return XML_TOK_OPEN_BRACKET;
1096   case BT_RSQB:
1097     ptr += MINBPC(enc);
1098     if (! HAS_CHAR(enc, ptr, end))
1099       return -XML_TOK_CLOSE_BRACKET;
1100     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1101       REQUIRE_CHARS(enc, ptr, end, 2);
1102       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1103         *nextTokPtr = ptr + 2 * MINBPC(enc);
1104         return XML_TOK_COND_SECT_CLOSE;
1105       }
1106     }
1107     *nextTokPtr = ptr;
1108     return XML_TOK_CLOSE_BRACKET;
1109   case BT_LPAR:
1110     *nextTokPtr = ptr + MINBPC(enc);
1111     return XML_TOK_OPEN_PAREN;
1112   case BT_RPAR:
1113     ptr += MINBPC(enc);
1114     if (! HAS_CHAR(enc, ptr, end))
1115       return -XML_TOK_CLOSE_PAREN;
1116     switch (BYTE_TYPE(enc, ptr)) {
1117     case BT_AST:
1118       *nextTokPtr = ptr + MINBPC(enc);
1119       return XML_TOK_CLOSE_PAREN_ASTERISK;
1120     case BT_QUEST:
1121       *nextTokPtr = ptr + MINBPC(enc);
1122       return XML_TOK_CLOSE_PAREN_QUESTION;
1123     case BT_PLUS:
1124       *nextTokPtr = ptr + MINBPC(enc);
1125       return XML_TOK_CLOSE_PAREN_PLUS;
1126     case BT_CR:
1127     case BT_LF:
1128     case BT_S:
1129     case BT_GT:
1130     case BT_COMMA:
1131     case BT_VERBAR:
1132     case BT_RPAR:
1133       *nextTokPtr = ptr;
1134       return XML_TOK_CLOSE_PAREN;
1135     }
1136     *nextTokPtr = ptr;
1137     return XML_TOK_INVALID;
1138   case BT_VERBAR:
1139     *nextTokPtr = ptr + MINBPC(enc);
1140     return XML_TOK_OR;
1141   case BT_GT:
1142     *nextTokPtr = ptr + MINBPC(enc);
1143     return XML_TOK_DECL_CLOSE;
1144   case BT_NUM:
1145     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1146 #  define LEAD_CASE(n)                                                         \
1147   case BT_LEAD##n:                                                             \
1148     if (end - ptr < n)                                                         \
1149       return XML_TOK_PARTIAL_CHAR;                                             \
1150     if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
1151       *nextTokPtr = ptr;                                                       \
1152       return XML_TOK_INVALID;                                                  \
1153     }                                                                          \
1154     if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
1155       ptr += n;                                                                \
1156       tok = XML_TOK_NAME;                                                      \
1157       break;                                                                   \
1158     }                                                                          \
1159     if (IS_NAME_CHAR(enc, ptr, n)) {                                           \
1160       ptr += n;                                                                \
1161       tok = XML_TOK_NMTOKEN;                                                   \
1162       break;                                                                   \
1163     }                                                                          \
1164     *nextTokPtr = ptr;                                                         \
1165     return XML_TOK_INVALID;
1166     LEAD_CASE(2)
1167     LEAD_CASE(3)
1168     LEAD_CASE(4)
1169 #  undef LEAD_CASE
1170   case BT_NMSTRT:
1171   case BT_HEX:
1172     tok = XML_TOK_NAME;
1173     ptr += MINBPC(enc);
1174     break;
1175   case BT_DIGIT:
1176   case BT_NAME:
1177   case BT_MINUS:
1178 #  ifdef XML_NS
1179   case BT_COLON:
1180 #  endif
1181     tok = XML_TOK_NMTOKEN;
1182     ptr += MINBPC(enc);
1183     break;
1184   case BT_NONASCII:
1185     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1186       ptr += MINBPC(enc);
1187       tok = XML_TOK_NAME;
1188       break;
1189     }
1190     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1191       ptr += MINBPC(enc);
1192       tok = XML_TOK_NMTOKEN;
1193       break;
1194     }
1195     /* fall through */
1196   default:
1197     *nextTokPtr = ptr;
1198     return XML_TOK_INVALID;
1199   }
1200   while (HAS_CHAR(enc, ptr, end)) {
1201     switch (BYTE_TYPE(enc, ptr)) {
1202       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1203     case BT_GT:
1204     case BT_RPAR:
1205     case BT_COMMA:
1206     case BT_VERBAR:
1207     case BT_LSQB:
1208     case BT_PERCNT:
1209     case BT_S:
1210     case BT_CR:
1211     case BT_LF:
1212       *nextTokPtr = ptr;
1213       return tok;
1214 #  ifdef XML_NS
1215     case BT_COLON:
1216       ptr += MINBPC(enc);
1217       switch (tok) {
1218       case XML_TOK_NAME:
1219         REQUIRE_CHAR(enc, ptr, end);
1220         tok = XML_TOK_PREFIXED_NAME;
1221         switch (BYTE_TYPE(enc, ptr)) {
1222           CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1223         default:
1224           tok = XML_TOK_NMTOKEN;
1225           break;
1226         }
1227         break;
1228       case XML_TOK_PREFIXED_NAME:
1229         tok = XML_TOK_NMTOKEN;
1230         break;
1231       }
1232       break;
1233 #  endif
1234     case BT_PLUS:
1235       if (tok == XML_TOK_NMTOKEN) {
1236         *nextTokPtr = ptr;
1237         return XML_TOK_INVALID;
1238       }
1239       *nextTokPtr = ptr + MINBPC(enc);
1240       return XML_TOK_NAME_PLUS;
1241     case BT_AST:
1242       if (tok == XML_TOK_NMTOKEN) {
1243         *nextTokPtr = ptr;
1244         return XML_TOK_INVALID;
1245       }
1246       *nextTokPtr = ptr + MINBPC(enc);
1247       return XML_TOK_NAME_ASTERISK;
1248     case BT_QUEST:
1249       if (tok == XML_TOK_NMTOKEN) {
1250         *nextTokPtr = ptr;
1251         return XML_TOK_INVALID;
1252       }
1253       *nextTokPtr = ptr + MINBPC(enc);
1254       return XML_TOK_NAME_QUESTION;
1255     default:
1256       *nextTokPtr = ptr;
1257       return XML_TOK_INVALID;
1258     }
1259   }
1260   return -tok;
1261 }
1262 
1263 static int PTRCALL
PREFIX(attributeValueTok)1264 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1265                           const char **nextTokPtr) {
1266   const char *start;
1267   if (ptr >= end)
1268     return XML_TOK_NONE;
1269   else if (! HAS_CHAR(enc, ptr, end)) {
1270     /* This line cannot be executed.  The incoming data has already
1271      * been tokenized once, so incomplete characters like this have
1272      * already been eliminated from the input.  Retaining the paranoia
1273      * check is still valuable, however.
1274      */
1275     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1276   }
1277   start = ptr;
1278   while (HAS_CHAR(enc, ptr, end)) {
1279     switch (BYTE_TYPE(enc, ptr)) {
1280 #  define LEAD_CASE(n)                                                         \
1281   case BT_LEAD##n:                                                             \
1282     ptr += n; /* NOTE: The encoding has already been validated. */             \
1283     break;
1284       LEAD_CASE(2)
1285       LEAD_CASE(3)
1286       LEAD_CASE(4)
1287 #  undef LEAD_CASE
1288     case BT_AMP:
1289       if (ptr == start)
1290         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1291       *nextTokPtr = ptr;
1292       return XML_TOK_DATA_CHARS;
1293     case BT_LT:
1294       /* this is for inside entity references */
1295       *nextTokPtr = ptr;
1296       return XML_TOK_INVALID;
1297     case BT_LF:
1298       if (ptr == start) {
1299         *nextTokPtr = ptr + MINBPC(enc);
1300         return XML_TOK_DATA_NEWLINE;
1301       }
1302       *nextTokPtr = ptr;
1303       return XML_TOK_DATA_CHARS;
1304     case BT_CR:
1305       if (ptr == start) {
1306         ptr += MINBPC(enc);
1307         if (! HAS_CHAR(enc, ptr, end))
1308           return XML_TOK_TRAILING_CR;
1309         if (BYTE_TYPE(enc, ptr) == BT_LF)
1310           ptr += MINBPC(enc);
1311         *nextTokPtr = ptr;
1312         return XML_TOK_DATA_NEWLINE;
1313       }
1314       *nextTokPtr = ptr;
1315       return XML_TOK_DATA_CHARS;
1316     case BT_S:
1317       if (ptr == start) {
1318         *nextTokPtr = ptr + MINBPC(enc);
1319         return XML_TOK_ATTRIBUTE_VALUE_S;
1320       }
1321       *nextTokPtr = ptr;
1322       return XML_TOK_DATA_CHARS;
1323     default:
1324       ptr += MINBPC(enc);
1325       break;
1326     }
1327   }
1328   *nextTokPtr = ptr;
1329   return XML_TOK_DATA_CHARS;
1330 }
1331 
1332 static int PTRCALL
PREFIX(entityValueTok)1333 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1334                        const char **nextTokPtr) {
1335   const char *start;
1336   if (ptr >= end)
1337     return XML_TOK_NONE;
1338   else if (! HAS_CHAR(enc, ptr, end)) {
1339     /* This line cannot be executed.  The incoming data has already
1340      * been tokenized once, so incomplete characters like this have
1341      * already been eliminated from the input.  Retaining the paranoia
1342      * check is still valuable, however.
1343      */
1344     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1345   }
1346   start = ptr;
1347   while (HAS_CHAR(enc, ptr, end)) {
1348     switch (BYTE_TYPE(enc, ptr)) {
1349 #  define LEAD_CASE(n)                                                         \
1350   case BT_LEAD##n:                                                             \
1351     ptr += n; /* NOTE: The encoding has already been validated. */             \
1352     break;
1353       LEAD_CASE(2)
1354       LEAD_CASE(3)
1355       LEAD_CASE(4)
1356 #  undef LEAD_CASE
1357     case BT_AMP:
1358       if (ptr == start)
1359         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1360       *nextTokPtr = ptr;
1361       return XML_TOK_DATA_CHARS;
1362     case BT_PERCNT:
1363       if (ptr == start) {
1364         int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1365         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1366       }
1367       *nextTokPtr = ptr;
1368       return XML_TOK_DATA_CHARS;
1369     case BT_LF:
1370       if (ptr == start) {
1371         *nextTokPtr = ptr + MINBPC(enc);
1372         return XML_TOK_DATA_NEWLINE;
1373       }
1374       *nextTokPtr = ptr;
1375       return XML_TOK_DATA_CHARS;
1376     case BT_CR:
1377       if (ptr == start) {
1378         ptr += MINBPC(enc);
1379         if (! HAS_CHAR(enc, ptr, end))
1380           return XML_TOK_TRAILING_CR;
1381         if (BYTE_TYPE(enc, ptr) == BT_LF)
1382           ptr += MINBPC(enc);
1383         *nextTokPtr = ptr;
1384         return XML_TOK_DATA_NEWLINE;
1385       }
1386       *nextTokPtr = ptr;
1387       return XML_TOK_DATA_CHARS;
1388     default:
1389       ptr += MINBPC(enc);
1390       break;
1391     }
1392   }
1393   *nextTokPtr = ptr;
1394   return XML_TOK_DATA_CHARS;
1395 }
1396 
1397 #  ifdef XML_DTD
1398 
1399 static int PTRCALL
PREFIX(ignoreSectionTok)1400 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1401                          const char **nextTokPtr) {
1402   int level = 0;
1403   if (MINBPC(enc) > 1) {
1404     size_t n = end - ptr;
1405     if (n & (MINBPC(enc) - 1)) {
1406       n &= ~(MINBPC(enc) - 1);
1407       end = ptr + n;
1408     }
1409   }
1410   while (HAS_CHAR(enc, ptr, end)) {
1411     switch (BYTE_TYPE(enc, ptr)) {
1412       INVALID_CASES(ptr, nextTokPtr)
1413     case BT_LT:
1414       ptr += MINBPC(enc);
1415       REQUIRE_CHAR(enc, ptr, end);
1416       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1417         ptr += MINBPC(enc);
1418         REQUIRE_CHAR(enc, ptr, end);
1419         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1420           ++level;
1421           ptr += MINBPC(enc);
1422         }
1423       }
1424       break;
1425     case BT_RSQB:
1426       ptr += MINBPC(enc);
1427       REQUIRE_CHAR(enc, ptr, end);
1428       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1429         ptr += MINBPC(enc);
1430         REQUIRE_CHAR(enc, ptr, end);
1431         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1432           ptr += MINBPC(enc);
1433           if (level == 0) {
1434             *nextTokPtr = ptr;
1435             return XML_TOK_IGNORE_SECT;
1436           }
1437           --level;
1438         }
1439       }
1440       break;
1441     default:
1442       ptr += MINBPC(enc);
1443       break;
1444     }
1445   }
1446   return XML_TOK_PARTIAL;
1447 }
1448 
1449 #  endif /* XML_DTD */
1450 
1451 static int PTRCALL
PREFIX(isPublicId)1452 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1453                    const char **badPtr) {
1454   ptr += MINBPC(enc);
1455   end -= MINBPC(enc);
1456   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1457     switch (BYTE_TYPE(enc, ptr)) {
1458     case BT_DIGIT:
1459     case BT_HEX:
1460     case BT_MINUS:
1461     case BT_APOS:
1462     case BT_LPAR:
1463     case BT_RPAR:
1464     case BT_PLUS:
1465     case BT_COMMA:
1466     case BT_SOL:
1467     case BT_EQUALS:
1468     case BT_QUEST:
1469     case BT_CR:
1470     case BT_LF:
1471     case BT_SEMI:
1472     case BT_EXCL:
1473     case BT_AST:
1474     case BT_PERCNT:
1475     case BT_NUM:
1476 #  ifdef XML_NS
1477     case BT_COLON:
1478 #  endif
1479       break;
1480     case BT_S:
1481       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1482         *badPtr = ptr;
1483         return 0;
1484       }
1485       break;
1486     case BT_NAME:
1487     case BT_NMSTRT:
1488       if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1489         break;
1490       /* fall through */
1491     default:
1492       switch (BYTE_TO_ASCII(enc, ptr)) {
1493       case 0x24: /* $ */
1494       case 0x40: /* @ */
1495         break;
1496       default:
1497         *badPtr = ptr;
1498         return 0;
1499       }
1500       break;
1501     }
1502   }
1503   return 1;
1504 }
1505 
1506 /* This must only be called for a well-formed start-tag or empty
1507    element tag.  Returns the number of attributes.  Pointers to the
1508    first attsMax attributes are stored in atts.
1509 */
1510 
1511 static int PTRCALL
PREFIX(getAtts)1512 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1513                 ATTRIBUTE *atts) {
1514   enum { other, inName, inValue } state = inName;
1515   int nAtts = 0;
1516   int open = 0; /* defined when state == inValue;
1517                    initialization just to shut up compilers */
1518 
1519   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1520     switch (BYTE_TYPE(enc, ptr)) {
1521 #  define START_NAME                                                           \
1522     if (state == other) {                                                      \
1523       if (nAtts < attsMax) {                                                   \
1524         atts[nAtts].name = ptr;                                                \
1525         atts[nAtts].normalized = 1;                                            \
1526       }                                                                        \
1527       state = inName;                                                          \
1528     }
1529 #  define LEAD_CASE(n)                                                         \
1530   case BT_LEAD##n: /* NOTE: The encoding has already been validated. */        \
1531     START_NAME ptr += (n - MINBPC(enc));                                       \
1532     break;
1533       LEAD_CASE(2)
1534       LEAD_CASE(3)
1535       LEAD_CASE(4)
1536 #  undef LEAD_CASE
1537     case BT_NONASCII:
1538     case BT_NMSTRT:
1539     case BT_HEX:
1540       START_NAME
1541       break;
1542 #  undef START_NAME
1543     case BT_QUOT:
1544       if (state != inValue) {
1545         if (nAtts < attsMax)
1546           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1547         state = inValue;
1548         open = BT_QUOT;
1549       } else if (open == BT_QUOT) {
1550         state = other;
1551         if (nAtts < attsMax)
1552           atts[nAtts].valueEnd = ptr;
1553         nAtts++;
1554       }
1555       break;
1556     case BT_APOS:
1557       if (state != inValue) {
1558         if (nAtts < attsMax)
1559           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1560         state = inValue;
1561         open = BT_APOS;
1562       } else if (open == BT_APOS) {
1563         state = other;
1564         if (nAtts < attsMax)
1565           atts[nAtts].valueEnd = ptr;
1566         nAtts++;
1567       }
1568       break;
1569     case BT_AMP:
1570       if (nAtts < attsMax)
1571         atts[nAtts].normalized = 0;
1572       break;
1573     case BT_S:
1574       if (state == inName)
1575         state = other;
1576       else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1577                && (ptr == atts[nAtts].valuePtr
1578                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1579                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1580                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1581         atts[nAtts].normalized = 0;
1582       break;
1583     case BT_CR:
1584     case BT_LF:
1585       /* This case ensures that the first attribute name is counted
1586          Apart from that we could just change state on the quote. */
1587       if (state == inName)
1588         state = other;
1589       else if (state == inValue && nAtts < attsMax)
1590         atts[nAtts].normalized = 0;
1591       break;
1592     case BT_GT:
1593     case BT_SOL:
1594       if (state != inValue)
1595         return nAtts;
1596       break;
1597     default:
1598       break;
1599     }
1600   }
1601   /* not reached */
1602 }
1603 
1604 static int PTRFASTCALL
PREFIX(charRefNumber)1605 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1606   int result = 0;
1607   /* skip &# */
1608   UNUSED_P(enc);
1609   ptr += 2 * MINBPC(enc);
1610   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1611     for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1612          ptr += MINBPC(enc)) {
1613       int c = BYTE_TO_ASCII(enc, ptr);
1614       switch (c) {
1615       case ASCII_0:
1616       case ASCII_1:
1617       case ASCII_2:
1618       case ASCII_3:
1619       case ASCII_4:
1620       case ASCII_5:
1621       case ASCII_6:
1622       case ASCII_7:
1623       case ASCII_8:
1624       case ASCII_9:
1625         result <<= 4;
1626         result |= (c - ASCII_0);
1627         break;
1628       case ASCII_A:
1629       case ASCII_B:
1630       case ASCII_C:
1631       case ASCII_D:
1632       case ASCII_E:
1633       case ASCII_F:
1634         result <<= 4;
1635         result += 10 + (c - ASCII_A);
1636         break;
1637       case ASCII_a:
1638       case ASCII_b:
1639       case ASCII_c:
1640       case ASCII_d:
1641       case ASCII_e:
1642       case ASCII_f:
1643         result <<= 4;
1644         result += 10 + (c - ASCII_a);
1645         break;
1646       }
1647       if (result >= 0x110000)
1648         return -1;
1649     }
1650   } else {
1651     for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1652       int c = BYTE_TO_ASCII(enc, ptr);
1653       result *= 10;
1654       result += (c - ASCII_0);
1655       if (result >= 0x110000)
1656         return -1;
1657     }
1658   }
1659   return checkCharRefNumber(result);
1660 }
1661 
1662 static int PTRCALL
PREFIX(predefinedEntityName)1663 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1664                              const char *end) {
1665   UNUSED_P(enc);
1666   switch ((end - ptr) / MINBPC(enc)) {
1667   case 2:
1668     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1669       switch (BYTE_TO_ASCII(enc, ptr)) {
1670       case ASCII_l:
1671         return ASCII_LT;
1672       case ASCII_g:
1673         return ASCII_GT;
1674       }
1675     }
1676     break;
1677   case 3:
1678     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1679       ptr += MINBPC(enc);
1680       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1681         ptr += MINBPC(enc);
1682         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1683           return ASCII_AMP;
1684       }
1685     }
1686     break;
1687   case 4:
1688     switch (BYTE_TO_ASCII(enc, ptr)) {
1689     case ASCII_q:
1690       ptr += MINBPC(enc);
1691       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1692         ptr += MINBPC(enc);
1693         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1694           ptr += MINBPC(enc);
1695           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1696             return ASCII_QUOT;
1697         }
1698       }
1699       break;
1700     case ASCII_a:
1701       ptr += MINBPC(enc);
1702       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1703         ptr += MINBPC(enc);
1704         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1705           ptr += MINBPC(enc);
1706           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1707             return ASCII_APOS;
1708         }
1709       }
1710       break;
1711     }
1712   }
1713   return 0;
1714 }
1715 
1716 static int PTRCALL
PREFIX(nameMatchesAscii)1717 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1718                          const char *end1, const char *ptr2) {
1719   UNUSED_P(enc);
1720   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1721     if (end1 - ptr1 < MINBPC(enc)) {
1722       /* This line cannot be executed.  The incoming data has already
1723        * been tokenized once, so incomplete characters like this have
1724        * already been eliminated from the input.  Retaining the
1725        * paranoia check is still valuable, however.
1726        */
1727       return 0; /* LCOV_EXCL_LINE */
1728     }
1729     if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1730       return 0;
1731   }
1732   return ptr1 == end1;
1733 }
1734 
1735 static int PTRFASTCALL
PREFIX(nameLength)1736 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1737   const char *start = ptr;
1738   for (;;) {
1739     switch (BYTE_TYPE(enc, ptr)) {
1740 #  define LEAD_CASE(n)                                                         \
1741   case BT_LEAD##n:                                                             \
1742     ptr += n; /* NOTE: The encoding has already been validated. */             \
1743     break;
1744       LEAD_CASE(2)
1745       LEAD_CASE(3)
1746       LEAD_CASE(4)
1747 #  undef LEAD_CASE
1748     case BT_NONASCII:
1749     case BT_NMSTRT:
1750 #  ifdef XML_NS
1751     case BT_COLON:
1752 #  endif
1753     case BT_HEX:
1754     case BT_DIGIT:
1755     case BT_NAME:
1756     case BT_MINUS:
1757       ptr += MINBPC(enc);
1758       break;
1759     default:
1760       return (int)(ptr - start);
1761     }
1762   }
1763 }
1764 
1765 static const char *PTRFASTCALL
PREFIX(skipS)1766 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1767   for (;;) {
1768     switch (BYTE_TYPE(enc, ptr)) {
1769     case BT_LF:
1770     case BT_CR:
1771     case BT_S:
1772       ptr += MINBPC(enc);
1773       break;
1774     default:
1775       return ptr;
1776     }
1777   }
1778 }
1779 
1780 static void PTRCALL
PREFIX(updatePosition)1781 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1782                        POSITION *pos) {
1783   while (HAS_CHAR(enc, ptr, end)) {
1784     switch (BYTE_TYPE(enc, ptr)) {
1785 #  define LEAD_CASE(n)                                                         \
1786   case BT_LEAD##n:                                                             \
1787     ptr += n; /* NOTE: The encoding has already been validated. */             \
1788     pos->columnNumber++;                                                       \
1789     break;
1790       LEAD_CASE(2)
1791       LEAD_CASE(3)
1792       LEAD_CASE(4)
1793 #  undef LEAD_CASE
1794     case BT_LF:
1795       pos->columnNumber = 0;
1796       pos->lineNumber++;
1797       ptr += MINBPC(enc);
1798       break;
1799     case BT_CR:
1800       pos->lineNumber++;
1801       ptr += MINBPC(enc);
1802       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1803         ptr += MINBPC(enc);
1804       pos->columnNumber = 0;
1805       break;
1806     default:
1807       ptr += MINBPC(enc);
1808       pos->columnNumber++;
1809       break;
1810     }
1811   }
1812 }
1813 
1814 #  undef DO_LEAD_CASE
1815 #  undef MULTIBYTE_CASES
1816 #  undef INVALID_CASES
1817 #  undef CHECK_NAME_CASE
1818 #  undef CHECK_NAME_CASES
1819 #  undef CHECK_NMSTRT_CASE
1820 #  undef CHECK_NMSTRT_CASES
1821 
1822 #endif /* XML_TOK_IMPL_C */
1823 
1824 #endif /* LV_USE_XML */
1825 
1826