1 /* SPDX-License-Identifier: MIT */
2
3 /* Based on src/http/ngx_http_parse.c from NGINX copyright Igor Sysoev
4 *
5 * Additional changes are licensed under the same terms as NGINX and
6 * copyright Joyent, Inc. and other Node contributors. All rights reserved.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to
10 * deal in the Software without restriction, including without limitation the
11 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
12 * sell copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * IN THE SOFTWARE.
25 */
26 #include <zephyr/sys/__assert.h>
27 #include <stddef.h>
28 #include <ctype.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <limits.h>
32 #include <zephyr/net/http/parser_url.h>
33 #include <zephyr/toolchain.h>
34
#ifndef BIT_AT
/* Test bit number i of the byte array a.
 *
 * Bits are numbered least-significant first within each element, so bit i
 * is found in a[i / 8] at position i % 8. Evaluates to 1 when the bit is
 * set and to 0 otherwise.
 */
#define BIT_AT(a, i)                                                   \
	((((unsigned int)(a)[(unsigned int)(i) / 8U] >>                \
	   ((unsigned int)(i) % 8U)) & 1U) != 0U)
#endif
40
/* Set the mark FOR; non-destructive if the mark is already set.
 *
 * Expands to a single statement that assigns the identifier `p` to the
 * variable `<FOR>_mark`, but only while that variable still tests false, so
 * the first value recorded is the one that is kept. Neither `p` nor
 * `<FOR>_mark` is declared here; both must be in scope where the macro is
 * expanded.
 *
 * NOTE(review): no use of MARK is visible in this part of the file -
 * presumably a leftover from the full HTTP parser; confirm before removing.
 */
#define MARK(FOR) \
	do { \
		if (!FOR##_mark) { \
			FOR##_mark = p; \
		} \
	} while (false)
48
49
/* T(v) marks table bits that are accepted in lenient mode only: it expands
 * to v normally and to 0 when HTTP_PARSER_STRICT is defined.
 */
#ifdef HTTP_PARSER_STRICT
#define T(v) 0
#else
#define T(v) v
#endif

/* Bitmap of the characters accepted as ordinary URL characters.
 *
 * Entry k describes character codes 8*k .. 8*k+7, least significant bit
 * first; a set bit means "accepted". Only the 16 entries for codes 0..127
 * are written out, the remaining 16 entries are zero-initialized.
 */
static const uint8_t normal_url_char[32] = {
	/*   0..7    nul soh stx etx eot enq ack bel : none accepted */
	0x00,
	/*   8..15   bs ht nl vt np cr so si : ht and np, lenient mode only */
	T(0x02) | T(0x10),
	/*  16..23   dle dc1 dc2 dc3 dc4 nak syn etb : none accepted */
	0x00,
	/*  24..31   can em sub esc fs gs rs us : none accepted */
	0x00,
	/*  32..39   sp ! " # $ % & ' : all but sp and '#' */
	0xf6,
	/*  40..47   ( ) * + , - . / */
	0xff,
	/*  48..55   0 1 2 3 4 5 6 7 */
	0xff,
	/*  56..63   8 9 : ; < = > ? : all but '?' */
	0x7f,
	/*  64..71   @ A B C D E F G */
	0xff,
	/*  72..79   H I J K L M N O */
	0xff,
	/*  80..87   P Q R S T U V W */
	0xff,
	/*  88..95   X Y Z [ \ ] ^ _ */
	0xff,
	/*  96..103  ` a b c d e f g */
	0xff,
	/* 104..111  h i j k l m n o */
	0xff,
	/* 112..119  p q r s t u v w */
	0xff,
	/* 120..127  x y z { | } ~ del : all but del */
	0x7f,
};

#undef T
93
/* States of the authority (userinfo / host / port) sub-parser implemented
 * by http_parse_host_char() and driven by http_parse_host().
 */
enum http_host_state {
	/* the current character is not acceptable; parsing has failed */
	s_http_host_dead = 1,
	/* initial state when the authority is known to contain '@' */
	s_http_userinfo_start = 2,
	/* inside the userinfo part */
	s_http_userinfo = 3,
	/* initial state without '@', or the state right after '@' */
	s_http_host_start = 4,
	/* right after the opening '[' of an IPv6 literal */
	s_http_host_v6_start = 5,
	/* inside a host name */
	s_http_host = 6,
	/* inside the address part of an IPv6 literal */
	s_http_host_v6 = 7,
	/* right after the closing ']' of an IPv6 literal */
	s_http_host_v6_end = 8,
	/* right after the '%' that introduces an IPv6 zone id */
	s_http_host_v6_zone_start = 9,
	/* inside an IPv6 zone id */
	s_http_host_v6_zone = 10,
	/* right after the ':' that introduces the port */
	s_http_host_port_start = 11,
	/* inside the port digits */
	s_http_host_port = 12,
};
108
109
/* Macros for character classes; IS_URL_CHAR and IS_HOST_CHAR depend on
 * strict-mode (HTTP_PARSER_STRICT).
 *
 * Every macro fully parenthesizes its argument, so any expression may be
 * passed. Most of them evaluate the argument more than once, so the
 * argument must be free of side effects.
 */

/* Set bit 5 of c; this maps 'A'..'Z' onto 'a'..'z' and leaves 'a'..'z'
 * unchanged. The result is only meaningful for range checks on letters.
 */
#define LOWER(c)            ((unsigned char)((c) | 0x20))
#define IS_ALPHA(c)         (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c)           ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c)      (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c)           (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))

/* Punctuation accepted in addition to alphanumerics by IS_USERINFO_CHAR */
#define IS_MARK(c)          ((c) == '-' || (c) == '_' || (c) == '.' || \
			     (c) == '!' || (c) == '~' || (c) == '*' || \
			     (c) == '\'' || (c) == '(' || (c) == ')')

/* Characters accepted in the userinfo part of an authority ('@' excluded) */
#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
			     (c) == ';' || (c) == ':' || (c) == '&' || \
			     (c) == '=' || (c) == '+' || (c) == '$' || \
			     (c) == ',')

#ifdef HTTP_PARSER_STRICT
/* Strict mode: only characters flagged in normal_url_char are URL
 * characters, and '_' is not a host character.
 */
#define IS_URL_CHAR(c)      (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c)     (IS_ALPHANUM(c) || (c) == '.' || (c) == '-')
#else
/* Lenient mode: additionally accept any byte with the high bit set as a URL
 * character, and '_' as a host character.
 */
#define IS_URL_CHAR(c) \
	(BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
#define IS_HOST_CHAR(c) \
	(IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_')
#endif
135
136 /* Our URL parser.
137 *
138 * This is designed to be shared by http_parser_execute() for URL validation,
139 * hence it has a state transition + byte-for-byte interface. In addition, it
140 * is meant to be embedded in http_parser_parse_url(), which does the dirty
141 * work of turning state transitions URL components for its API.
142 *
143 * This function should only be invoked with non-space characters. It is
144 * assumed that the caller cares about (and can detect) the transition between
145 * URL and non-URL states by looking for these.
146 */
parse_url_char(enum state s,const char ch)147 enum state parse_url_char(enum state s, const char ch)
148 {
149 if (ch == ' ' || ch == '\r' || ch == '\n') {
150 return s_dead;
151 }
152
153 #ifdef HTTP_PARSER_STRICT
154 if (ch == '\t' || ch == '\f') {
155 return s_dead;
156 }
157 #endif
158
159 switch (s) {
160 case s_req_spaces_before_url:
161 /* Proxied requests are followed by scheme of an absolute URI
162 * (alpha).
163 * All methods except CONNECT are followed by '/' or '*'.
164 */
165
166 if (ch == '/' || ch == '*') {
167 return s_req_path;
168 }
169
170 if (IS_ALPHA(ch)) {
171 return s_req_schema;
172 }
173
174 break;
175
176 case s_req_schema:
177 if (IS_ALPHA(ch)) {
178 return s;
179 }
180
181 if (ch == ':') {
182 return s_req_schema_slash;
183 }
184
185 break;
186
187 case s_req_schema_slash:
188 if (ch == '/') {
189 return s_req_schema_slash_slash;
190 }
191
192 break;
193
194 case s_req_schema_slash_slash:
195 if (ch == '/') {
196 return s_req_server_start;
197 }
198
199 break;
200
201 case s_req_server_with_at:
202 if (ch == '@') {
203 return s_dead;
204 }
205
206 __fallthrough;
207 case s_req_server_start:
208 case s_req_server:
209 if (ch == '/') {
210 return s_req_path;
211 }
212
213 if (ch == '?') {
214 return s_req_query_string_start;
215 }
216
217 if (ch == '@') {
218 return s_req_server_with_at;
219 }
220
221 if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
222 return s_req_server;
223 }
224
225 break;
226
227 case s_req_path:
228 if (IS_URL_CHAR(ch)) {
229 return s;
230 }
231
232 switch (ch) {
233 case '?':
234 return s_req_query_string_start;
235
236 case '#':
237 return s_req_fragment_start;
238 }
239
240 break;
241
242 case s_req_query_string_start:
243 case s_req_query_string:
244 if (IS_URL_CHAR(ch)) {
245 return s_req_query_string;
246 }
247
248 switch (ch) {
249 case '?':
250 /* allow extra '?' in query string */
251 return s_req_query_string;
252
253 case '#':
254 return s_req_fragment_start;
255 }
256
257 break;
258
259 case s_req_fragment_start:
260 if (IS_URL_CHAR(ch)) {
261 return s_req_fragment;
262 }
263
264 switch (ch) {
265 case '?':
266 return s_req_fragment;
267
268 case '#':
269 return s;
270 }
271
272 break;
273
274 case s_req_fragment:
275 if (IS_URL_CHAR(ch)) {
276 return s;
277 }
278
279 switch (ch) {
280 case '?':
281 case '#':
282 return s;
283 }
284
285 break;
286
287 default:
288 break;
289 }
290
291 /* We should never fall out of the switch above unless there's
292 * an error
293 */
294 return s_dead;
295 }
296
297 static enum http_host_state
http_parse_host_char(enum http_host_state s,const char ch)298 http_parse_host_char(enum http_host_state s, const char ch)
299 {
300 switch (s) {
301 case s_http_userinfo:
302 case s_http_userinfo_start:
303 if (ch == '@') {
304 return s_http_host_start;
305 }
306
307 if (IS_USERINFO_CHAR(ch)) {
308 return s_http_userinfo;
309 }
310 break;
311
312 case s_http_host_start:
313 if (ch == '[') {
314 return s_http_host_v6_start;
315 }
316
317 if (IS_HOST_CHAR(ch)) {
318 return s_http_host;
319 }
320
321 break;
322
323 case s_http_host:
324 if (IS_HOST_CHAR(ch)) {
325 return s_http_host;
326 }
327
328 __fallthrough;
329 case s_http_host_v6_end:
330 if (ch == ':') {
331 return s_http_host_port_start;
332 }
333
334 break;
335
336 case s_http_host_v6:
337 if (ch == ']') {
338 return s_http_host_v6_end;
339 }
340
341 __fallthrough;
342 case s_http_host_v6_start:
343 if (IS_HEX(ch) || ch == ':' || ch == '.') {
344 return s_http_host_v6;
345 }
346
347 if (s == s_http_host_v6 && ch == '%') {
348 return s_http_host_v6_zone_start;
349 }
350 break;
351
352 case s_http_host_v6_zone:
353 if (ch == ']') {
354 return s_http_host_v6_end;
355 }
356
357 __fallthrough;
358 case s_http_host_v6_zone_start:
359 /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
360 if (IS_ALPHANUM(ch) || ch == '%' || ch == '.' || ch == '-' ||
361 ch == '_' ||
362 ch == '~') {
363 return s_http_host_v6_zone;
364 }
365 break;
366
367 case s_http_host_port:
368 case s_http_host_port_start:
369 if (IS_NUM(ch)) {
370 return s_http_host_port;
371 }
372
373 break;
374
375 default:
376 break;
377 }
378 return s_http_host_dead;
379 }
380
381 static
http_parse_host(const char * buf,struct http_parser_url * u,int found_at)382 int http_parse_host(const char *buf, struct http_parser_url *u,
383 int found_at)
384 {
385 enum http_host_state s;
386 size_t buflen;
387 const char *p;
388
389 buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
390 __ASSERT_NO_MSG(u->field_set & (1 << UF_HOST));
391
392 u->field_data[UF_HOST].len = 0U;
393
394 s = found_at ? s_http_userinfo_start : s_http_host_start;
395
396 for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
397 enum http_host_state new_s = http_parse_host_char(s, *p);
398
399 if (new_s == s_http_host_dead) {
400 return 1;
401 }
402
403 switch (new_s) {
404 case s_http_host:
405 if (s != s_http_host) {
406 u->field_data[UF_HOST].off = p - buf;
407 }
408 u->field_data[UF_HOST].len++;
409 break;
410
411 case s_http_host_v6:
412 if (s != s_http_host_v6) {
413 u->field_data[UF_HOST].off = p - buf;
414 }
415 u->field_data[UF_HOST].len++;
416 break;
417
418 case s_http_host_v6_zone_start:
419 case s_http_host_v6_zone:
420 u->field_data[UF_HOST].len++;
421 break;
422
423 case s_http_host_port:
424 if (s != s_http_host_port) {
425 u->field_data[UF_PORT].off = p - buf;
426 u->field_data[UF_PORT].len = 0U;
427 u->field_set |= (1 << UF_PORT);
428 }
429 u->field_data[UF_PORT].len++;
430 break;
431
432 case s_http_userinfo:
433 if (s != s_http_userinfo) {
434 u->field_data[UF_USERINFO].off = p - buf;
435 u->field_data[UF_USERINFO].len = 0U;
436 u->field_set |= (1 << UF_USERINFO);
437 }
438 u->field_data[UF_USERINFO].len++;
439 break;
440
441 default:
442 break;
443 }
444 s = new_s;
445 }
446
447 /* Make sure we don't end somewhere unexpected */
448 switch (s) {
449 case s_http_host_start:
450 case s_http_host_v6_start:
451 case s_http_host_v6:
452 case s_http_host_v6_zone_start:
453 case s_http_host_v6_zone:
454 case s_http_host_port_start:
455 case s_http_userinfo:
456 case s_http_userinfo_start:
457 return 1;
458 default:
459 break;
460 }
461
462 return 0;
463 }
464
465 void
http_parser_url_init(struct http_parser_url * u)466 http_parser_url_init(struct http_parser_url *u)
467 {
468 (void)memset(u, 0, sizeof(*u));
469 }
470
471 int
http_parser_parse_url(const char * buf,size_t buflen,int is_connect,struct http_parser_url * u)472 http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
473 struct http_parser_url *u)
474 {
475 enum http_parser_url_fields old_uf;
476 enum http_parser_url_fields uf;
477 int found_at = 0;
478 const char *p;
479 enum state s;
480
481 u->port = u->field_set = 0U;
482 s = is_connect ? s_req_server_start : s_req_spaces_before_url;
483 old_uf = UF_MAX;
484
485 for (p = buf; p < buf + buflen; p++) {
486 s = parse_url_char(s, *p);
487
488 /* Figure out the next field that we're operating on */
489 switch (s) {
490 case s_dead:
491 return 1;
492
493 /* Skip delimiters */
494 case s_req_schema_slash:
495 case s_req_schema_slash_slash:
496 case s_req_server_start:
497 case s_req_query_string_start:
498 case s_req_fragment_start:
499 continue;
500
501 case s_req_schema:
502 uf = UF_SCHEMA;
503 break;
504
505 case s_req_server_with_at:
506 found_at = 1;
507 __fallthrough;
508
509 case s_req_server:
510 uf = UF_HOST;
511 break;
512
513 case s_req_path:
514 uf = UF_PATH;
515 break;
516
517 case s_req_query_string:
518 uf = UF_QUERY;
519 break;
520
521 case s_req_fragment:
522 uf = UF_FRAGMENT;
523 break;
524
525 default:
526 __ASSERT_NO_MSG(!"Unexpected state");
527 return 1;
528 }
529
530 /* Nothing's changed; soldier on */
531 if (uf == old_uf) {
532 u->field_data[uf].len++;
533 continue;
534 }
535
536 u->field_data[uf].off = p - buf;
537 u->field_data[uf].len = 1U;
538
539 u->field_set |= (1 << uf);
540 old_uf = uf;
541 }
542
543 /* host must be present if there is a schema */
544 /* parsing http:///toto will fail */
545 if ((u->field_set & (1 << UF_SCHEMA)) &&
546 (u->field_set & (1 << UF_HOST)) == 0U) {
547 return 1;
548 }
549
550 if (u->field_set & (1 << UF_HOST)) {
551 if (http_parse_host(buf, u, found_at) != 0) {
552 return 1;
553 }
554 }
555
556 /* CONNECT requests can only contain "hostname:port" */
557 if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
558 return 1;
559 }
560
561 if (u->field_set & (1 << UF_PORT)) {
562 /* Don't bother with endp; we've already validated the string */
563 unsigned long v;
564
565 v = strtoul(buf + u->field_data[UF_PORT].off, NULL, 10);
566
567 /* Ports have a max value of 2^16 */
568 if (v > 0xffff) {
569 return 1;
570 }
571
572 u->port = (uint16_t) v;
573 }
574
575 return 0;
576 }
577