1 /* SPDX-License-Identifier: MIT */
2 
3 /* Based on src/http/ngx_http_parse.c from NGINX copyright Igor Sysoev
4  *
5  * Additional changes are licensed under the same terms as NGINX and
6  * copyright Joyent, Inc. and other Node contributors. All rights reserved.
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to
10  * deal in the Software without restriction, including without limitation the
11  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
12  * sell copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24  * IN THE SOFTWARE.
25  */
26 #include <sys/__assert.h>
27 #include <stddef.h>
28 #include <ctype.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <limits.h>
32 #include <net/http_parser_url.h>
33 #include <toolchain.h>
34 
#ifndef BIT_AT
/* Evaluates to 1 when bit number i of the byte array a is set, 0 otherwise.
 * Bit i is stored in byte (i / 8), at bit position (i % 8) counted from the
 * least significant bit.
 */
# define BIT_AT(a, i)                                                \
	((((unsigned int) (a)[(unsigned int) (i) / 8U]) &            \
	  (1U << ((unsigned int) (i) % 8U))) != 0U)
#endif
40 
/* Remember the current position p as the start of FOR. The first call wins:
 * once FOR##_mark is non-NULL, later calls leave it untouched. Both
 * FOR##_mark and p must be in scope where the macro is used.
 */
#define MARK(FOR)                                                          \
do {                                                                       \
	if (FOR##_mark == NULL) {                                          \
		FOR##_mark = p;                                            \
	}                                                                  \
} while (0)
48 
49 
/* T(v) keeps v in the default (lenient) build and turns it into 0 when
 * HTTP_PARSER_STRICT is defined, so the tolerated control characters below
 * are compiled out of the table in strict mode.
 */
#ifndef HTTP_PARSER_STRICT
# define T(v) v
#else
# define T(v) 0
#endif

/* Bitmap over the 128 ASCII codes, consulted through
 * BIT_AT(normal_url_char, c): character c is bit (c % 8) of byte (c / 8).
 * A set bit marks c as an ordinary URL character. Each entry below covers
 * eight consecutive codes, least significant bit first.
 */
static const uint8_t normal_url_char[32] = {
	/* 0x00-0x07: nul soh stx etx eot enq ack bel -> none */
	0x00,
	/* 0x08-0x0f: bs ht nl vt np cr so si -> ht and np, lenient mode only */
	T(0x02) | T(0x10),
	/* 0x10-0x17: dle dc1 dc2 dc3 dc4 nak syn etb -> none */
	0x00,
	/* 0x18-0x1f: can em sub esc fs gs rs us -> none */
	0x00,
	/* 0x20-0x27: sp ! " # $ % & ' -> all but sp and '#' */
	0xf6,
	/* 0x28-0x2f: ( ) * + , - . / -> all */
	0xff,
	/* 0x30-0x37: 0 1 2 3 4 5 6 7 -> all */
	0xff,
	/* 0x38-0x3f: 8 9 : ; < = > ? -> all but '?' */
	0x7f,
	/* 0x40-0x47: @ A B C D E F G -> all */
	0xff,
	/* 0x48-0x4f: H I J K L M N O -> all */
	0xff,
	/* 0x50-0x57: P Q R S T U V W -> all */
	0xff,
	/* 0x58-0x5f: X Y Z [ backslash ] ^ _ -> all */
	0xff,
	/* 0x60-0x67: ` a b c d e f g -> all */
	0xff,
	/* 0x68-0x6f: h i j k l m n o -> all */
	0xff,
	/* 0x70-0x77: p q r s t u v w -> all */
	0xff,
	/* 0x78-0x7f: x y z { | } ~ del -> all but del */
	0x7f,
};

#undef T
93 
/* States of the authority ("userinfo@host:port") sub-parser implemented by
 * http_parse_host_char(). Values are consecutive and start at 1.
 */
enum http_host_state {
	s_http_host_dead          = 1,  /* character not valid in this state */
	s_http_userinfo_start     = 2,  /* initial state when an '@' was seen */
	s_http_userinfo           = 3,  /* inside the userinfo part */
	s_http_host_start         = 4,  /* initial state, or just after '@' */
	s_http_host_v6_start      = 5,  /* just after '[' */
	s_http_host               = 6,  /* inside a host name */
	s_http_host_v6            = 7,  /* inside a bracketed IPv6 literal */
	s_http_host_v6_end        = 8,  /* just after ']' */
	s_http_host_v6_zone_start = 9,  /* just after '%' in an IPv6 literal */
	s_http_host_v6_zone       = 10, /* inside an IPv6 zone id */
	s_http_host_port_start    = 11, /* just after the ':' before the port */
	s_http_host_port          = 12  /* inside the port digits */
};
108 
109 
/* Macros for character classes; IS_URL_CHAR and IS_HOST_CHAR depend on
 * strict-mode.
 *
 * Every macro parenthesizes its argument, so an arbitrary expression may be
 * passed. Most of them expand the argument more than once, so it must not
 * have side effects.
 */

/* ASCII lower-casing by setting bit 0x20; only meaningful for letters. */
#define LOWER(c)            ((unsigned char)((c) | 0x20))
#define IS_ALPHA(c)         (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c)           ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c)      (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c)           (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))

#define IS_MARK(c)		((c) == '-' || (c) == '_' || (c) == '.' || \
				 (c) == '!' || (c) == '~' || (c) == '*' || \
				 (c) == '\'' || (c) == '(' || (c) == ')')

#define IS_USERINFO_CHAR(c) (IS_ALPHANUM(c) || IS_MARK(c) || (c) == '%' || \
				(c) == ';' || (c) == ':' || (c) == '&' ||  \
				(c) == '=' || (c) == '+' || (c) == '$' ||  \
				(c) == ',')

#ifdef HTTP_PARSER_STRICT
#define IS_URL_CHAR(c)      (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c)     (IS_ALPHANUM(c) || (c) == '.' || (c) == '-')
#else
/* Lenient mode additionally accepts any byte with the high bit set in URLs
 * and '_' in host names.
 */
#define IS_URL_CHAR(c)                                                         \
	(BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
#define IS_HOST_CHAR(c)                                                        \
	(IS_ALPHANUM(c) || (c) == '.' || (c) == '-' || (c) == '_')
#endif
135 
136 /* Our URL parser.
137  *
138  * This is designed to be shared by http_parser_execute() for URL validation,
139  * hence it has a state transition + byte-for-byte interface. In addition, it
140  * is meant to be embedded in http_parser_parse_url(), which does the dirty
141  * work of turning state transitions URL components for its API.
142  *
143  * This function should only be invoked with non-space characters. It is
144  * assumed that the caller cares about (and can detect) the transition between
145  * URL and non-URL states by looking for these.
146  */
parse_url_char(enum state s,const char ch)147 enum state parse_url_char(enum state s, const char ch)
148 {
149 	if (ch == ' ' || ch == '\r' || ch == '\n') {
150 		return s_dead;
151 	}
152 
153 #ifdef HTTP_PARSER_STRICT
154 	if (ch == '\t' || ch == '\f') {
155 		return s_dead;
156 	}
157 #endif
158 
159 	switch (s) {
160 	case s_req_spaces_before_url:
161 		/* Proxied requests are followed by scheme of an absolute URI
162 		 * (alpha).
163 		 * All methods except CONNECT are followed by '/' or '*'.
164 		 */
165 
166 		if (ch == '/' || ch == '*') {
167 			return s_req_path;
168 		}
169 
170 		if (IS_ALPHA(ch)) {
171 			return s_req_schema;
172 		}
173 
174 		break;
175 
176 	case s_req_schema:
177 		if (IS_ALPHA(ch)) {
178 			return s;
179 		}
180 
181 		if (ch == ':') {
182 			return s_req_schema_slash;
183 		}
184 
185 		break;
186 
187 	case s_req_schema_slash:
188 		if (ch == '/') {
189 			return s_req_schema_slash_slash;
190 		}
191 
192 		break;
193 
194 	case s_req_schema_slash_slash:
195 		if (ch == '/') {
196 			return s_req_server_start;
197 		}
198 
199 		break;
200 
201 	case s_req_server_with_at:
202 		if (ch == '@') {
203 			return s_dead;
204 		}
205 
206 		__fallthrough;
207 	case s_req_server_start:
208 	case s_req_server:
209 		if (ch == '/') {
210 			return s_req_path;
211 		}
212 
213 		if (ch == '?') {
214 			return s_req_query_string_start;
215 		}
216 
217 		if (ch == '@') {
218 			return s_req_server_with_at;
219 		}
220 
221 		if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
222 			return s_req_server;
223 		}
224 
225 		break;
226 
227 	case s_req_path:
228 		if (IS_URL_CHAR(ch)) {
229 			return s;
230 		}
231 
232 		switch (ch) {
233 		case '?':
234 			return s_req_query_string_start;
235 
236 		case '#':
237 			return s_req_fragment_start;
238 		}
239 
240 		break;
241 
242 	case s_req_query_string_start:
243 	case s_req_query_string:
244 		if (IS_URL_CHAR(ch)) {
245 			return s_req_query_string;
246 		}
247 
248 		switch (ch) {
249 		case '?':
250 			/* allow extra '?' in query string */
251 			return s_req_query_string;
252 
253 		case '#':
254 			return s_req_fragment_start;
255 		}
256 
257 		break;
258 
259 	case s_req_fragment_start:
260 		if (IS_URL_CHAR(ch)) {
261 			return s_req_fragment;
262 		}
263 
264 		switch (ch) {
265 		case '?':
266 			return s_req_fragment;
267 
268 		case '#':
269 			return s;
270 		}
271 
272 		break;
273 
274 	case s_req_fragment:
275 		if (IS_URL_CHAR(ch)) {
276 			return s;
277 		}
278 
279 		switch (ch) {
280 		case '?':
281 		case '#':
282 			return s;
283 		}
284 
285 		break;
286 
287 	default:
288 		break;
289 	}
290 
291 	/* We should never fall out of the switch above unless there's
292 	 * an error
293 	 */
294 	return s_dead;
295 }
296 
297 static enum http_host_state
http_parse_host_char(enum http_host_state s,const char ch)298 http_parse_host_char(enum http_host_state s, const char ch)
299 {
300 	switch (s) {
301 	case s_http_userinfo:
302 	case s_http_userinfo_start:
303 		if (ch == '@') {
304 			return s_http_host_start;
305 		}
306 
307 		if (IS_USERINFO_CHAR(ch)) {
308 			return s_http_userinfo;
309 		}
310 		break;
311 
312 	case s_http_host_start:
313 		if (ch == '[') {
314 			return s_http_host_v6_start;
315 		}
316 
317 		if (IS_HOST_CHAR(ch)) {
318 			return s_http_host;
319 		}
320 
321 		break;
322 
323 	case s_http_host:
324 		if (IS_HOST_CHAR(ch)) {
325 			return s_http_host;
326 		}
327 
328 		__fallthrough;
329 	case s_http_host_v6_end:
330 		if (ch == ':') {
331 			return s_http_host_port_start;
332 		}
333 
334 		break;
335 
336 	case s_http_host_v6:
337 		if (ch == ']') {
338 			return s_http_host_v6_end;
339 		}
340 
341 		__fallthrough;
342 	case s_http_host_v6_start:
343 		if (IS_HEX(ch) || ch == ':' || ch == '.') {
344 			return s_http_host_v6;
345 		}
346 
347 		if (s == s_http_host_v6 && ch == '%') {
348 			return s_http_host_v6_zone_start;
349 		}
350 		break;
351 
352 	case s_http_host_v6_zone:
353 		if (ch == ']') {
354 			return s_http_host_v6_end;
355 		}
356 
357 		__fallthrough;
358 	case s_http_host_v6_zone_start:
359 		/* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
360 		if (IS_ALPHANUM(ch) || ch == '%' || ch == '.' || ch == '-' ||
361 				ch == '_' ||
362 				ch == '~') {
363 			return s_http_host_v6_zone;
364 		}
365 		break;
366 
367 	case s_http_host_port:
368 	case s_http_host_port_start:
369 		if (IS_NUM(ch)) {
370 			return s_http_host_port;
371 		}
372 
373 		break;
374 
375 	default:
376 		break;
377 	}
378 	return s_http_host_dead;
379 }
380 
381 static
http_parse_host(const char * buf,struct http_parser_url * u,int found_at)382 int http_parse_host(const char *buf, struct http_parser_url *u,
383 		    int found_at)
384 {
385 	enum http_host_state s;
386 	size_t buflen;
387 	const char *p;
388 
389 	buflen = u->field_data[UF_HOST].off + u->field_data[UF_HOST].len;
390 	__ASSERT_NO_MSG(u->field_set & (1 << UF_HOST));
391 
392 	u->field_data[UF_HOST].len = 0U;
393 
394 	s = found_at ? s_http_userinfo_start : s_http_host_start;
395 
396 	for (p = buf + u->field_data[UF_HOST].off; p < buf + buflen; p++) {
397 		enum http_host_state new_s = http_parse_host_char(s, *p);
398 
399 		if (new_s == s_http_host_dead) {
400 			return 1;
401 		}
402 
403 		switch (new_s) {
404 		case s_http_host:
405 			if (s != s_http_host) {
406 				u->field_data[UF_HOST].off = p - buf;
407 			}
408 			u->field_data[UF_HOST].len++;
409 			break;
410 
411 		case s_http_host_v6:
412 			if (s != s_http_host_v6) {
413 				u->field_data[UF_HOST].off = p - buf;
414 			}
415 			u->field_data[UF_HOST].len++;
416 			break;
417 
418 		case s_http_host_v6_zone_start:
419 		case s_http_host_v6_zone:
420 			u->field_data[UF_HOST].len++;
421 			break;
422 
423 		case s_http_host_port:
424 			if (s != s_http_host_port) {
425 				u->field_data[UF_PORT].off = p - buf;
426 				u->field_data[UF_PORT].len = 0U;
427 				u->field_set |= (1 << UF_PORT);
428 			}
429 			u->field_data[UF_PORT].len++;
430 			break;
431 
432 		case s_http_userinfo:
433 			if (s != s_http_userinfo) {
434 				u->field_data[UF_USERINFO].off = p - buf;
435 				u->field_data[UF_USERINFO].len = 0U;
436 				u->field_set |= (1 << UF_USERINFO);
437 			}
438 			u->field_data[UF_USERINFO].len++;
439 			break;
440 
441 		default:
442 			break;
443 		}
444 		s = new_s;
445 	}
446 
447 	/* Make sure we don't end somewhere unexpected */
448 	switch (s) {
449 	case s_http_host_start:
450 	case s_http_host_v6_start:
451 	case s_http_host_v6:
452 	case s_http_host_v6_zone_start:
453 	case s_http_host_v6_zone:
454 	case s_http_host_port_start:
455 	case s_http_userinfo:
456 	case s_http_userinfo_start:
457 		return 1;
458 	default:
459 		break;
460 	}
461 
462 	return 0;
463 }
464 
465 void
http_parser_url_init(struct http_parser_url * u)466 http_parser_url_init(struct http_parser_url *u)
467 {
468 	(void)memset(u, 0, sizeof(*u));
469 }
470 
471 int
http_parser_parse_url(const char * buf,size_t buflen,int is_connect,struct http_parser_url * u)472 http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
473 		      struct http_parser_url *u)
474 {
475 	enum http_parser_url_fields old_uf;
476 	enum http_parser_url_fields uf;
477 	int found_at = 0;
478 	const char *p;
479 	enum state s;
480 
481 	u->port = u->field_set = 0U;
482 	s = is_connect ? s_req_server_start : s_req_spaces_before_url;
483 	old_uf = UF_MAX;
484 
485 	for (p = buf; p < buf + buflen; p++) {
486 		s = parse_url_char(s, *p);
487 
488 		/* Figure out the next field that we're operating on */
489 		switch (s) {
490 		case s_dead:
491 			return 1;
492 
493 		/* Skip delimeters */
494 		case s_req_schema_slash:
495 		case s_req_schema_slash_slash:
496 		case s_req_server_start:
497 		case s_req_query_string_start:
498 		case s_req_fragment_start:
499 			continue;
500 
501 		case s_req_schema:
502 			uf = UF_SCHEMA;
503 			break;
504 
505 		case s_req_server_with_at:
506 			found_at = 1;
507 			__fallthrough;
508 
509 		case s_req_server:
510 			uf = UF_HOST;
511 			break;
512 
513 		case s_req_path:
514 			uf = UF_PATH;
515 			break;
516 
517 		case s_req_query_string:
518 			uf = UF_QUERY;
519 			break;
520 
521 		case s_req_fragment:
522 			uf = UF_FRAGMENT;
523 			break;
524 
525 		default:
526 			__ASSERT_NO_MSG(!"Unexpected state");
527 			return 1;
528 		}
529 
530 		/* Nothing's changed; soldier on */
531 		if (uf == old_uf) {
532 			u->field_data[uf].len++;
533 			continue;
534 		}
535 
536 		u->field_data[uf].off = p - buf;
537 		u->field_data[uf].len = 1U;
538 
539 		u->field_set |= (1 << uf);
540 		old_uf = uf;
541 	}
542 
543 	/* host must be present if there is a schema */
544 	/* parsing http:///toto will fail */
545 	if ((u->field_set & (1 << UF_SCHEMA)) &&
546 			(u->field_set & (1 << UF_HOST)) == 0U) {
547 		return 1;
548 	}
549 
550 	if (u->field_set & (1 << UF_HOST)) {
551 		if (http_parse_host(buf, u, found_at) != 0) {
552 			return 1;
553 		}
554 	}
555 
556 	/* CONNECT requests can only contain "hostname:port" */
557 	if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
558 		return 1;
559 	}
560 
561 	if (u->field_set & (1 << UF_PORT)) {
562 		/* Don't bother with endp; we've already validated the string */
563 		unsigned long v;
564 
565 		v = strtoul(buf + u->field_data[UF_PORT].off, NULL, 10);
566 
567 		/* Ports have a max value of 2^16 */
568 		if (v > 0xffff) {
569 			return 1;
570 		}
571 
572 		u->port = (uint16_t) v;
573 	}
574 
575 	return 0;
576 }
577