1 /*
2  *  Regexp compilation.
3  *
4  *  See doc/regexp.rst for a discussion of the compilation approach and
5  *  current limitations.
6  *
7  *  Regexp bytecode assumes jumps can be expressed with signed 32-bit
8  *  integers.  Consequently the bytecode size must not exceed 0x7fffffffL.
9  *  The implementation casts duk_size_t (buffer size) to duk_(u)int32_t
10  *  in many places.  Although this could be changed, the bytecode format
11  *  limit would still prevent regexps exceeding the signed 32-bit limit
12  *  from working.
13  *
14  *  XXX: The implementation does not prevent bytecode from exceeding the
15  *  maximum supported size.  This could be done by limiting the maximum
16  *  input string size (assuming an upper bound can be computed for number
17  *  of bytecode bytes emitted per input byte) or checking buffer maximum
18  *  size when emitting bytecode (slower).
19  */
20 
21 #include "duk_internal.h"
22 
23 #ifdef DUK_USE_REGEXP_SUPPORT
24 
25 /*
26  *  Helper macros
27  */
28 
29 #define DUK__RE_INITIAL_BUFSIZE 64
30 
31 #undef DUK__RE_BUFLEN
32 #define DUK__RE_BUFLEN(re_ctx) \
33 	DUK_BW_GET_SIZE(re_ctx->thr, &re_ctx->bw)
34 
35 /*
36  *  Disjunction struct: result of parsing a disjunction
37  */
38 
39 typedef struct {
40 	/* Number of characters that the atom matches (e.g. 3 for 'abc'),
41 	 * -1 if atom is complex and number of matched characters either
42 	 * varies or is not known.
43 	 */
44 	duk_int32_t charlen;
45 
46 #if 0
47 	/* These are not needed to implement quantifier capture handling,
48 	 * but might be needed at some point.
49 	 */
50 
51 	/* re_ctx->captures at start and end of atom parsing.
52 	 * Since 'captures' indicates highest capture number emitted
53 	 * so far in a DUK_REOP_SAVE, the captures numbers saved by
54 	 * the atom are: ]start_captures,end_captures].
55 	 */
56 	duk_uint32_t start_captures;
57 	duk_uint32_t end_captures;
58 #endif
59 } duk__re_disjunction_info;
60 
61 /*
62  *  Encoding helpers
63  *
64  *  Some of the typing is bytecode based, e.g. slice sizes are unsigned 32-bit
65  *  even though the buffer operations will use duk_size_t.
66  */
67 
68 /* XXX: the insert helpers should ensure that the bytecode result is not
69  * larger than expected (or at least assert for it).  Many things in the
70  * bytecode, like skip offsets, won't work correctly if the bytecode is
71  * larger than say 2G.
72  */
73 
duk__encode_i32(duk_int32_t x)74 DUK_LOCAL duk_uint32_t duk__encode_i32(duk_int32_t x) {
75 	if (x < 0) {
76 		return ((duk_uint32_t) (-x)) * 2 + 1;
77 	} else {
78 		return ((duk_uint32_t) x) * 2;
79 	}
80 }
81 
82 /* XXX: return type should probably be duk_size_t, or explicit checks are needed for
83  * maximum size.
84  */
duk__insert_u32(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_uint32_t x)85 DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) {
86 	duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
87 	duk_small_int_t len;
88 
89 	len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
90 	DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, len);
91 	return (duk_uint32_t) len;
92 }
93 
duk__append_u32(duk_re_compiler_ctx * re_ctx,duk_uint32_t x)94 DUK_LOCAL duk_uint32_t duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
95 	duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
96 	duk_small_int_t len;
97 
98 	len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
99 	DUK_BW_WRITE_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, buf, len);
100 	return (duk_uint32_t) len;
101 }
102 
duk__insert_i32(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_int32_t x)103 DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {
104 	return duk__insert_u32(re_ctx, offset, duk__encode_i32(x));
105 }
106 
107 #if 0  /* unused */
108 DUK_LOCAL duk_uint32_t duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {
109 	return duk__append_u32(re_ctx, duk__encode_i32(x));
110 }
111 #endif
112 
113 /* special helper for emitting u16 lists (used for character ranges for built-in char classes) */
duk__append_u16_list(duk_re_compiler_ctx * re_ctx,const duk_uint16_t * values,duk_uint32_t count)114 DUK_LOCAL void duk__append_u16_list(duk_re_compiler_ctx *re_ctx, const duk_uint16_t *values, duk_uint32_t count) {
115 	/* Call sites don't need the result length so it's not accumulated. */
116 	while (count > 0) {
117 		(void) duk__append_u32(re_ctx, (duk_uint32_t) (*values++));
118 		count--;
119 	}
120 }
121 
duk__insert_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_uint32_t data_offset,duk_uint32_t data_length)122 DUK_LOCAL void duk__insert_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t data_offset, duk_uint32_t data_length) {
123 	DUK_BW_INSERT_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, offset, data_offset, data_length);
124 }
125 
duk__append_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t data_offset,duk_uint32_t data_length)126 DUK_LOCAL void duk__append_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
127 	DUK_BW_WRITE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
128 }
129 
duk__remove_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t data_offset,duk_uint32_t data_length)130 DUK_LOCAL void duk__remove_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
131 	DUK_BW_REMOVE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
132 }
133 
134 /*
135  *  Insert a jump offset at 'offset' to complete an instruction
136  *  (the jump offset is always the last component of an instruction).
137  *  The 'skip' argument must be computed relative to 'offset',
138  *  -without- taking into account the skip field being inserted.
139  *
140  *       ... A B C ins X Y Z ...   (ins may be a JUMP, SPLIT1/SPLIT2, etc)
141  *   =>  ... A B C ins SKIP X Y Z
142  *
143  *  Computing the final (adjusted) skip value, which is relative to the
144  *  first byte of the next instruction, is a bit tricky because of the
145  *  variable length UTF-8 encoding.  See doc/regexp.rst for discussion.
146  */
duk__insert_jump_offset(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_int32_t skip)147 DUK_LOCAL duk_uint32_t duk__insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t skip) {
148 	duk_small_int_t len;
149 
150 	/* XXX: solve into closed form (smaller code) */
151 
152 	if (skip < 0) {
153 		/* two encoding attempts suffices */
154 		len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip));
155 		len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip - (duk_int32_t) len));
156 		DUK_ASSERT(duk_unicode_get_xutf8_length(duk__encode_i32(skip - (duk_int32_t) len)) == len);  /* no change */
157 		skip -= (duk_int32_t) len;
158 	}
159 	return duk__insert_i32(re_ctx, offset, skip);
160 }
161 
duk__append_jump_offset(duk_re_compiler_ctx * re_ctx,duk_int32_t skip)162 DUK_LOCAL duk_uint32_t duk__append_jump_offset(duk_re_compiler_ctx *re_ctx, duk_int32_t skip) {
163 	return (duk_uint32_t) duk__insert_jump_offset(re_ctx, (duk_uint32_t) DUK__RE_BUFLEN(re_ctx), skip);
164 }
165 
166 /*
167  *  duk_re_range_callback for generating character class ranges.
168  *
169  *  When ignoreCase is false, the range is simply emitted as is.
170  *  We don't, for instance, eliminate duplicates or overlapping
171  *  ranges in a character class.
172  *
173  *  When ignoreCase is true, the range needs to be normalized through
174  *  canonicalization.  Unfortunately a canonicalized version of a
175  *  continuous range is not necessarily continuous (e.g. [x-{] is
176  *  continuous but [X-{] is not).  The current algorithm creates the
177  *  canonicalized range(s) space efficiently at the cost of compile
178  *  time execution time (see doc/regexp.rst for discussion).
179  *
180  *  Note that the ctx->nranges is a context-wide temporary value
181  *  (this is OK because there cannot be multiple character classes
182  *  being parsed simultaneously).
183  */
184 
duk__generate_ranges(void * userdata,duk_codepoint_t r1,duk_codepoint_t r2,duk_bool_t direct)185 DUK_LOCAL void duk__generate_ranges(void *userdata, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct) {
186 	duk_re_compiler_ctx *re_ctx = (duk_re_compiler_ctx *) userdata;
187 
188 	DUK_DD(DUK_DDPRINT("duk__generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld",
189 	                   (void *) re_ctx, (long) r1, (long) r2, (long) direct));
190 
191 	if (!direct && (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE)) {
192 		/*
193 		 *  Canonicalize a range, generating result ranges as necessary.
194 		 *  Needs to exhaustively scan the entire range (at most 65536
195 		 *  code points).  If 'direct' is set, caller (lexer) has ensured
196 		 *  that the range is already canonicalization compatible (this
197 		 *  is used to avoid unnecessary canonicalization of built-in
198 		 *  ranges like \W, which are not affected by canonicalization).
199 		 *
200 		 *  NOTE: here is one place where we don't want to support chars
201 		 *  outside the BMP, because the exhaustive search would be
202 		 *  massively larger.
203 		 */
204 
205 		duk_codepoint_t i;
206 		duk_codepoint_t t;
207 		duk_codepoint_t r_start, r_end;
208 
209 		r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);
210 		r_end = r_start;
211 		for (i = r1 + 1; i <= r2; i++) {
212 			t = duk_unicode_re_canonicalize_char(re_ctx->thr, i);
213 			if (t == r_end + 1) {
214 				r_end = t;
215 			} else {
216 				DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
217 				duk__append_u32(re_ctx, (duk_uint32_t) r_start);
218 				duk__append_u32(re_ctx, (duk_uint32_t) r_end);
219 				re_ctx->nranges++;
220 				r_start = t;
221 				r_end = t;
222 			}
223 		}
224 		DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
225 		duk__append_u32(re_ctx, (duk_uint32_t) r_start);
226 		duk__append_u32(re_ctx, (duk_uint32_t) r_end);
227 		re_ctx->nranges++;
228 	} else {
229 		DUK_DD(DUK_DDPRINT("direct, emit range: [%ld,%ld]", (long) r1, (long) r2));
230 		duk__append_u32(re_ctx, (duk_uint32_t) r1);
231 		duk__append_u32(re_ctx, (duk_uint32_t) r2);
232 		re_ctx->nranges++;
233 	}
234 }
235 
236 /*
237  *  Parse regexp Disjunction.  Most of regexp compilation happens here.
238  *
239  *  Handles Disjunction, Alternative, and Term productions directly without
240  *  recursion.  The only constructs requiring recursion are positive/negative
241  *  lookaheads, capturing parentheses, and non-capturing parentheses.
242  *
243  *  The function determines whether the entire disjunction is a 'simple atom'
244  *  (see doc/regexp.rst discussion on 'simple quantifiers') and if so,
245  *  returns the atom character length which is needed by the caller to keep
246  *  track of its own atom character length.  A disjunction with more than one
247  *  alternative is never considered a simple atom (although in some cases
248  *  that might be the case).
249  *
250  *  Return value: simple atom character length or < 0 if not a simple atom.
251  *  Appends the bytecode for the disjunction matcher to the end of the temp
252  *  buffer.
253  *
254  *  Regexp top level structure is:
255  *
256  *    Disjunction = Term*
257  *                | Term* | Disjunction
258  *
259  *    Term = Assertion
260  *         | Atom
261  *         | Atom Quantifier
262  *
263  *  An empty Term sequence is a valid disjunction alternative (e.g. /|||c||/).
264  *
265  *  Notes:
266  *
267  *    * Tracking of the 'simple-ness' of the current atom vs. the entire
268  *      disjunction are separate matters.  For instance, the disjunction
269  *      may be complex, but individual atoms may be simple.  Furthermore,
270  *      simple quantifiers are used whenever possible, even if the
271  *      disjunction as a whole is complex.
272  *
273  *    * The estimate of whether an atom is simple is conservative now,
274  *      and it would be possible to expand it.  For instance, captures
275  *      cause the disjunction to be marked complex, even though captures
276  *      -can- be handled by simple quantifiers with some minor modifications.
277  *
278  *    * Disjunction 'tainting' as 'complex' is handled at the end of the
279  *      main for loop collectively for atoms.  Assertions, quantifiers,
280  *      and '|' tokens need to taint the result manually if necessary.
281  *      Assertions cannot add to result char length, only atoms (and
282  *      quantifiers) can; currently quantifiers will taint the result
283  *      as complex though.
284  */
285 
duk__parse_disjunction(duk_re_compiler_ctx * re_ctx,duk_bool_t expect_eof,duk__re_disjunction_info * out_atom_info)286 DUK_LOCAL void duk__parse_disjunction(duk_re_compiler_ctx *re_ctx, duk_bool_t expect_eof, duk__re_disjunction_info *out_atom_info) {
287 	duk_int32_t atom_start_offset = -1;                   /* negative -> no atom matched on previous round */
288 	duk_int32_t atom_char_length = 0;                     /* negative -> complex atom */
289 	duk_uint32_t atom_start_captures = re_ctx->captures;  /* value of re_ctx->captures at start of atom */
290 	duk_int32_t unpatched_disjunction_split = -1;
291 	duk_int32_t unpatched_disjunction_jump = -1;
292 	duk_uint32_t entry_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
293 	duk_int32_t res_charlen = 0;  /* -1 if disjunction is complex, char length if simple */
294 	duk__re_disjunction_info tmp_disj;
295 
296 	DUK_ASSERT(out_atom_info != NULL);
297 
298 	if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
299 		DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT);
300 	}
301 	re_ctx->recursion_depth++;
302 
303 #if 0
304 	out_atom_info->start_captures = re_ctx->captures;
305 #endif
306 
307 	for (;;) {
308 		/* atom_char_length, atom_start_offset, atom_start_offset reflect the
309 		 * atom matched on the previous loop.  If a quantifier is encountered
310 		 * on this loop, these are needed to handle the quantifier correctly.
311 		 * new_atom_char_length etc are for the atom parsed on this round;
312 		 * they're written to atom_char_length etc at the end of the round.
313 		 */
314 		duk_int32_t new_atom_char_length;   /* char length of the atom parsed in this loop */
315 		duk_int32_t new_atom_start_offset;  /* bytecode start offset of the atom parsed in this loop
316 		                                     * (allows quantifiers to copy the atom bytecode)
317 		                                     */
318 		duk_uint32_t new_atom_start_captures;  /* re_ctx->captures at the start of the atom parsed in this loop */
319 
320 		duk_lexer_parse_re_token(&re_ctx->lex, &re_ctx->curr_token);
321 
322 		DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)",
323 		                   (long) re_ctx->curr_token.t,
324 		                   (long) re_ctx->curr_token.num,
325 		                   (re_ctx->curr_token.num >= 0x20 && re_ctx->curr_token.num <= 0x7e) ?
326 		                   (int) re_ctx->curr_token.num : (int) '?'));
327 
328 		/* set by atom case clauses */
329 		new_atom_start_offset = -1;
330 		new_atom_char_length = -1;
331 		new_atom_start_captures = re_ctx->captures;
332 
333 		switch (re_ctx->curr_token.t) {
334 		case DUK_RETOK_DISJUNCTION: {
335 			/*
336 			 *  The handling here is a bit tricky.  If a previous '|' has been processed,
337 			 *  we have a pending split1 and a pending jump (for a previous match).  These
338 			 *  need to be back-patched carefully.  See docs for a detailed example.
339 			 */
340 
341 			/* patch pending jump and split */
342 			if (unpatched_disjunction_jump >= 0) {
343 				duk_uint32_t offset;
344 
345 				DUK_ASSERT(unpatched_disjunction_split >= 0);
346 				offset = unpatched_disjunction_jump;
347 				offset += duk__insert_jump_offset(re_ctx,
348 				                                  offset,
349 				                                  (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
350 				/* offset is now target of the pending split (right after jump) */
351 				duk__insert_jump_offset(re_ctx,
352 				                        unpatched_disjunction_split,
353 				                        offset - unpatched_disjunction_split);
354 			}
355 
356 			/* add a new pending split to the beginning of the entire disjunction */
357 			(void) duk__insert_u32(re_ctx,
358 			                       entry_offset,
359 			                       DUK_REOP_SPLIT1);   /* prefer direct execution */
360 			unpatched_disjunction_split = entry_offset + 1;   /* +1 for opcode */
361 
362 			/* add a new pending match jump for latest finished alternative */
363 			duk__append_u32(re_ctx, DUK_REOP_JUMP);
364 			unpatched_disjunction_jump = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
365 
366 			/* 'taint' result as complex */
367 			res_charlen = -1;
368 			break;
369 		}
370 		case DUK_RETOK_QUANTIFIER: {
371 			if (atom_start_offset < 0) {
372 				DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_NO_ATOM);
373 			}
374 			if (re_ctx->curr_token.qmin > re_ctx->curr_token.qmax) {
375 				DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_VALUES);
376 			}
377 			if (atom_char_length >= 0) {
378 				/*
379 				 *  Simple atom
380 				 *
381 				 *  If atom_char_length is zero, we'll have unbounded execution time for e.g.
382 				 *  /()*x/.exec('x').  We can't just skip the match because it might have some
383 				 *  side effects (for instance, if we allowed captures in simple atoms, the
384 				 *  capture needs to happen).  The simple solution below is to force the
385 				 *  quantifier to match at most once, since the additional matches have no effect.
386 				 *
387 				 *  With a simple atom there can be no capture groups, so no captures need
388 				 *  to be reset.
389 				 */
390 				duk_int32_t atom_code_length;
391 				duk_uint32_t offset;
392 				duk_uint32_t qmin, qmax;
393 
394 				qmin = re_ctx->curr_token.qmin;
395 				qmax = re_ctx->curr_token.qmax;
396 				if (atom_char_length == 0) {
397 					/* qmin and qmax will be 0 or 1 */
398 					if (qmin > 1) {
399 						qmin = 1;
400 					}
401 					if (qmax > 1) {
402 						qmax = 1;
403 					}
404 				}
405 
406 				duk__append_u32(re_ctx, DUK_REOP_MATCH);   /* complete 'sub atom' */
407 				atom_code_length = (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - atom_start_offset);
408 
409 				offset = atom_start_offset;
410 				if (re_ctx->curr_token.greedy) {
411 					offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQGREEDY);
412 					offset += duk__insert_u32(re_ctx, offset, qmin);
413 					offset += duk__insert_u32(re_ctx, offset, qmax);
414 					offset += duk__insert_u32(re_ctx, offset, atom_char_length);
415 					offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
416 				} else {
417 					offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQMINIMAL);
418 					offset += duk__insert_u32(re_ctx, offset, qmin);
419 					offset += duk__insert_u32(re_ctx, offset, qmax);
420 					offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
421 				}
422 				DUK_UNREF(offset);  /* silence scan-build warning */
423 			} else {
424 				/*
425 				 *  Complex atom
426 				 *
427 				 *  The original code is used as a template, and removed at the end
428 				 *  (this differs from the handling of simple quantifiers).
429 				 *
430 				 *  NOTE: there is no current solution for empty atoms in complex
431 				 *  quantifiers.  This would need some sort of a 'progress' instruction.
432 				 *
433 				 *  XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies?
434 				 */
435 				duk_int32_t atom_code_length;
436 				duk_uint32_t atom_copies;
437 				duk_uint32_t tmp_qmin, tmp_qmax;
438 
439 				/* pre-check how many atom copies we're willing to make (atom_copies not needed below) */
440 				atom_copies = (re_ctx->curr_token.qmax == DUK_RE_QUANTIFIER_INFINITE) ?
441 				              re_ctx->curr_token.qmin : re_ctx->curr_token.qmax;
442 				if (atom_copies > DUK_RE_MAX_ATOM_COPIES) {
443 					DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_QUANTIFIER_TOO_MANY_COPIES);
444 				}
445 
446 				/* wipe the capture range made by the atom (if any) */
447 				DUK_ASSERT(atom_start_captures <= re_ctx->captures);
448 				if (atom_start_captures != re_ctx->captures) {
449 					DUK_ASSERT(atom_start_captures < re_ctx->captures);
450 					DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]",
451 					                     (long) atom_start_captures, (long) re_ctx->captures));
452 
453 					/* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */
454 					duk__insert_u32(re_ctx, atom_start_offset, (re_ctx->captures - atom_start_captures) * 2);
455 					duk__insert_u32(re_ctx, atom_start_offset, (atom_start_captures + 1) * 2);
456 					duk__insert_u32(re_ctx, atom_start_offset, DUK_REOP_WIPERANGE);
457 				} else {
458 					DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld",
459 					                     (long) atom_start_captures));
460 				}
461 
462 				atom_code_length = (duk_int32_t) DUK__RE_BUFLEN(re_ctx) - atom_start_offset;
463 
464 				/* insert the required matches (qmin) by copying the atom */
465 				tmp_qmin = re_ctx->curr_token.qmin;
466 				tmp_qmax = re_ctx->curr_token.qmax;
467 				while (tmp_qmin > 0) {
468 					duk__append_slice(re_ctx, atom_start_offset, atom_code_length);
469 					tmp_qmin--;
470 					if (tmp_qmax != DUK_RE_QUANTIFIER_INFINITE) {
471 						tmp_qmax--;
472 					}
473 				}
474 				DUK_ASSERT(tmp_qmin == 0);
475 
476 				/* insert code for matching the remainder - infinite or finite */
477 				if (tmp_qmax == DUK_RE_QUANTIFIER_INFINITE) {
478 					/* reuse last emitted atom for remaining 'infinite' quantifier */
479 
480 					if (re_ctx->curr_token.qmin == 0) {
481 						/* Special case: original qmin was zero so there is nothing
482 						 * to repeat.  Emit an atom copy but jump over it here.
483 						 */
484 						duk__append_u32(re_ctx, DUK_REOP_JUMP);
485 						duk__append_jump_offset(re_ctx, atom_code_length);
486 						duk__append_slice(re_ctx, atom_start_offset, atom_code_length);
487 					}
488 					if (re_ctx->curr_token.greedy) {
489 						duk__append_u32(re_ctx, DUK_REOP_SPLIT2);   /* prefer jump */
490 					} else {
491 						duk__append_u32(re_ctx, DUK_REOP_SPLIT1);   /* prefer direct */
492 					}
493 					duk__append_jump_offset(re_ctx, -atom_code_length - 1);  /* -1 for opcode */
494 				} else {
495 					/*
496 					 *  The remaining matches are emitted as sequence of SPLITs and atom
497 					 *  copies; the SPLITs skip the remaining copies and match the sequel.
498 					 *  This sequence needs to be emitted starting from the last copy
499 					 *  because the SPLITs are variable length due to the variable length
500 					 *  skip offset.  This causes a lot of memory copying now.
501 					 *
502 					 *  Example structure (greedy, match maximum # atoms):
503 					 *
504 					 *      SPLIT1 LSEQ
505 					 *      (atom)
506 					 *      SPLIT1 LSEQ    ; <- the byte length of this instruction is needed
507 					 *      (atom)         ; to encode the above SPLIT1 correctly
508 					 *      ...
509 					 *   LSEQ:
510 					 */
511 					duk_uint32_t offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
512 					while (tmp_qmax > 0) {
513 						duk__insert_slice(re_ctx, offset, atom_start_offset, atom_code_length);
514 						if (re_ctx->curr_token.greedy) {
515 							duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT1);   /* prefer direct */
516 						} else {
517 							duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT2);   /* prefer jump */
518 						}
519 						duk__insert_jump_offset(re_ctx,
520 						                        offset + 1,   /* +1 for opcode */
521 						                        (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
522 						tmp_qmax--;
523 					}
524 				}
525 
526 				/* remove the original 'template' atom */
527 				duk__remove_slice(re_ctx, atom_start_offset, atom_code_length);
528 			}
529 
530 			/* 'taint' result as complex */
531 			res_charlen = -1;
532 			break;
533 		}
534 		case DUK_RETOK_ASSERT_START: {
535 			duk__append_u32(re_ctx, DUK_REOP_ASSERT_START);
536 			break;
537 		}
538 		case DUK_RETOK_ASSERT_END: {
539 			duk__append_u32(re_ctx, DUK_REOP_ASSERT_END);
540 			break;
541 		}
542 		case DUK_RETOK_ASSERT_WORD_BOUNDARY: {
543 			duk__append_u32(re_ctx, DUK_REOP_ASSERT_WORD_BOUNDARY);
544 			break;
545 		}
546 		case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY: {
547 			duk__append_u32(re_ctx, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
548 			break;
549 		}
550 		case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD:
551 		case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD: {
552 			duk_uint32_t offset;
553 			duk_uint32_t opcode = (re_ctx->curr_token.t == DUK_RETOK_ASSERT_START_POS_LOOKAHEAD) ?
554 			                      DUK_REOP_LOOKPOS : DUK_REOP_LOOKNEG;
555 
556 			offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
557 			duk__parse_disjunction(re_ctx, 0, &tmp_disj);
558 			duk__append_u32(re_ctx, DUK_REOP_MATCH);
559 
560 			(void) duk__insert_u32(re_ctx, offset, opcode);
561 			(void) duk__insert_jump_offset(re_ctx,
562 			                               offset + 1,   /* +1 for opcode */
563 			                               (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
564 
565 			/* 'taint' result as complex -- this is conservative,
566 			 * as lookaheads do not backtrack.
567 			 */
568 			res_charlen = -1;
569 			break;
570 		}
571 		case DUK_RETOK_ATOM_PERIOD: {
572 			new_atom_char_length = 1;
573 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
574 			duk__append_u32(re_ctx, DUK_REOP_PERIOD);
575 			break;
576 		}
577 		case DUK_RETOK_ATOM_CHAR: {
578 			/* Note: successive characters could be joined into string matches
579 			 * but this is not trivial (consider e.g. '/xyz+/); see docs for
580 			 * more discussion.
581 			 */
582 			duk_uint32_t ch;
583 
584 			new_atom_char_length = 1;
585 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
586 			duk__append_u32(re_ctx, DUK_REOP_CHAR);
587 			ch = re_ctx->curr_token.num;
588 			if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
589 				ch = duk_unicode_re_canonicalize_char(re_ctx->thr, ch);
590 			}
591 			duk__append_u32(re_ctx, ch);
592 			break;
593 		}
594 		case DUK_RETOK_ATOM_DIGIT:
595 		case DUK_RETOK_ATOM_NOT_DIGIT: {
596 			new_atom_char_length = 1;
597 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
598 			duk__append_u32(re_ctx,
599 			                (re_ctx->curr_token.t == DUK_RETOK_ATOM_DIGIT) ?
600 			                DUK_REOP_RANGES : DUK_REOP_INVRANGES);
601 			duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t)));
602 			duk__append_u16_list(re_ctx, duk_unicode_re_ranges_digit, sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));
603 			break;
604 		}
605 		case DUK_RETOK_ATOM_WHITE:
606 		case DUK_RETOK_ATOM_NOT_WHITE: {
607 			new_atom_char_length = 1;
608 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
609 			duk__append_u32(re_ctx,
610 			                (re_ctx->curr_token.t == DUK_RETOK_ATOM_WHITE) ?
611 			                DUK_REOP_RANGES : DUK_REOP_INVRANGES);
612 			duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_white) / (2 * sizeof(duk_uint16_t)));
613 			duk__append_u16_list(re_ctx, duk_unicode_re_ranges_white, sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t));
614 			break;
615 		}
616 		case DUK_RETOK_ATOM_WORD_CHAR:
617 		case DUK_RETOK_ATOM_NOT_WORD_CHAR: {
618 			new_atom_char_length = 1;
619 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
620 			duk__append_u32(re_ctx,
621 			                (re_ctx->curr_token.t == DUK_RETOK_ATOM_WORD_CHAR) ?
622 			                DUK_REOP_RANGES : DUK_REOP_INVRANGES);
623 			duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_wordchar) / (2 * sizeof(duk_uint16_t)));
624 			duk__append_u16_list(re_ctx, duk_unicode_re_ranges_wordchar, sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t));
625 			break;
626 		}
627 		case DUK_RETOK_ATOM_BACKREFERENCE: {
628 			duk_uint32_t backref = (duk_uint32_t) re_ctx->curr_token.num;
629 			if (backref > re_ctx->highest_backref) {
630 				re_ctx->highest_backref = backref;
631 			}
632 			new_atom_char_length = -1;   /* mark as complex */
633 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
634 			duk__append_u32(re_ctx, DUK_REOP_BACKREFERENCE);
635 			duk__append_u32(re_ctx, backref);
636 			break;
637 		}
638 		case DUK_RETOK_ATOM_START_CAPTURE_GROUP: {
639 			duk_uint32_t cap;
640 
641 			new_atom_char_length = -1;   /* mark as complex (capture handling) */
642 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
643 			cap = ++re_ctx->captures;
644 			duk__append_u32(re_ctx, DUK_REOP_SAVE);
645 			duk__append_u32(re_ctx, cap * 2);
646 			duk__parse_disjunction(re_ctx, 0, &tmp_disj);  /* retval (sub-atom char length) unused, tainted as complex above */
647 			duk__append_u32(re_ctx, DUK_REOP_SAVE);
648 			duk__append_u32(re_ctx, cap * 2 + 1);
649 			break;
650 		}
651 		case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP: {
652 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
653 			duk__parse_disjunction(re_ctx, 0, &tmp_disj);
654 			new_atom_char_length = tmp_disj.charlen;
655 			break;
656 		}
657 		case DUK_RETOK_ATOM_START_CHARCLASS:
658 		case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED: {
659 			/*
660 			 *  Range parsing is done with a special lexer function which calls
661 			 *  us for every range parsed.  This is different from how rest of
662 			 *  the parsing works, but avoids a heavy, arbitrary size intermediate
663 			 *  value type to hold the ranges.
664 			 *
665 			 *  Another complication is the handling of character ranges when
666 			 *  case insensitive matching is used (see docs for discussion).
667 			 *  The range handler callback given to the lexer takes care of this
668 			 *  as well.
669 			 *
670 			 *  Note that duplicate ranges are not eliminated when parsing character
671 			 *  classes, so that canonicalization of
672 			 *
673 			 *    [0-9a-fA-Fx-{]
674 			 *
675 			 *  creates the result (note the duplicate ranges):
676 			 *
677 			 *    [0-9A-FA-FX-Z{-{]
678 			 *
679 			 *  where [x-{] is split as a result of canonicalization.  The duplicate
680 			 *  ranges are not a semantics issue: they work correctly.
681 			 */
682 
683 			duk_uint32_t offset;
684 
685 			DUK_DD(DUK_DDPRINT("character class"));
686 
687 			/* insert ranges instruction, range count patched in later */
688 			new_atom_char_length = 1;
689 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
690 			duk__append_u32(re_ctx,
691 			                (re_ctx->curr_token.t == DUK_RETOK_ATOM_START_CHARCLASS) ?
692 			                DUK_REOP_RANGES : DUK_REOP_INVRANGES);
693 			offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);    /* patch in range count later */
694 
695 			/* parse ranges until character class ends */
696 			re_ctx->nranges = 0;    /* note: ctx-wide temporary */
697 			duk_lexer_parse_re_ranges(&re_ctx->lex, duk__generate_ranges, (void *) re_ctx);
698 
699 			/* insert range count */
700 			duk__insert_u32(re_ctx, offset, re_ctx->nranges);
701 			break;
702 		}
703 		case DUK_RETOK_ATOM_END_GROUP: {
704 			if (expect_eof) {
705 				DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_CLOSING_PAREN);
706 			}
707 			goto done;
708 		}
709 		case DUK_RETOK_EOF: {
710 			if (!expect_eof) {
711 				DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_END_OF_PATTERN);
712 			}
713 			goto done;
714 		}
715 		default: {
716 			DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_REGEXP_TOKEN);
717 		}
718 		}
719 
720 		/* a complex (new) atom taints the result */
721 		if (new_atom_start_offset >= 0) {
722 			if (new_atom_char_length < 0) {
723 				res_charlen = -1;
724 			} else if (res_charlen >= 0) {
725 				/* only advance if not tainted */
726 				res_charlen += new_atom_char_length;
727 			}
728 		}
729 
730 		/* record previous atom info in case next token is a quantifier */
731 		atom_start_offset = new_atom_start_offset;
732 		atom_char_length = new_atom_char_length;
733 		atom_start_captures = new_atom_start_captures;
734 	}
735 
736  done:
737 
738 	/* finish up pending jump and split for last alternative */
739 	if (unpatched_disjunction_jump >= 0) {
740 		duk_uint32_t offset;
741 
742 		DUK_ASSERT(unpatched_disjunction_split >= 0);
743 		offset = unpatched_disjunction_jump;
744 		offset += duk__insert_jump_offset(re_ctx,
745 		                                  offset,
746 		                                  (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
747 		/* offset is now target of the pending split (right after jump) */
748 		duk__insert_jump_offset(re_ctx,
749 		                        unpatched_disjunction_split,
750 		                        offset - unpatched_disjunction_split);
751 	}
752 
753 #if 0
754 	out_atom_info->end_captures = re_ctx->captures;
755 #endif
756 	out_atom_info->charlen = res_charlen;
757 	DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld",
758 	                     (long) out_atom_info->charlen));
759 
760 	re_ctx->recursion_depth--;
761 }
762 
763 /*
764  *  Flags parsing (see E5 Section 15.10.4.1).
765  */
766 
duk__parse_regexp_flags(duk_hthread * thr,duk_hstring * h)767 DUK_LOCAL duk_uint32_t duk__parse_regexp_flags(duk_hthread *thr, duk_hstring *h) {
768 	const duk_uint8_t *p;
769 	const duk_uint8_t *p_end;
770 	duk_uint32_t flags = 0;
771 
772 	p = DUK_HSTRING_GET_DATA(h);
773 	p_end = p + DUK_HSTRING_GET_BYTELEN(h);
774 
775 	/* Note: can be safely scanned as bytes (undecoded) */
776 
777 	while (p < p_end) {
778 		duk_uint8_t c = *p++;
779 		switch ((int) c) {
780 		case (int) 'g': {
781 			if (flags & DUK_RE_FLAG_GLOBAL) {
782 				goto error;
783 			}
784 			flags |= DUK_RE_FLAG_GLOBAL;
785 			break;
786 		}
787 		case (int) 'i': {
788 			if (flags & DUK_RE_FLAG_IGNORE_CASE) {
789 				goto error;
790 			}
791 			flags |= DUK_RE_FLAG_IGNORE_CASE;
792 			break;
793 		}
794 		case (int) 'm': {
795 			if (flags & DUK_RE_FLAG_MULTILINE) {
796 				goto error;
797 			}
798 			flags |= DUK_RE_FLAG_MULTILINE;
799 			break;
800 		}
801 		default: {
802 			goto error;
803 		}
804 		}
805 	}
806 
807 	return flags;
808 
809  error:
810 	DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_REGEXP_FLAGS);
811 	return 0;  /* never here */
812 }
813 
814 /*
815  *  Create escaped RegExp source (E5 Section 15.10.3).
816  *
817  *  The current approach is to special case the empty RegExp
818  *  ('' -> '(?:)') and otherwise replace unescaped '/' characters
819  *  with '\/' regardless of where they occur in the regexp.
820  *
821  *  Note that normalization does not seem to be necessary for
822  *  RegExp literals (e.g. '/foo/') because to be acceptable as
823  *  a RegExp literal, the text between forward slashes must
824  *  already match the escaping requirements (e.g. must not contain
825  *  unescaped forward slashes or be empty).  Escaping IS needed
826  *  for expressions like 'new Regexp("...", "")' however.
827  *  Currently, we re-escape in either case.
828  *
829  *  Also note that we process the source here in UTF-8 encoded
830  *  form.  This is correct, because any non-ASCII characters are
831  *  passed through without change.
832  */
833 
duk__create_escaped_source(duk_hthread * thr,int idx_pattern)834 DUK_LOCAL void duk__create_escaped_source(duk_hthread *thr, int idx_pattern) {
835 	duk_context *ctx = (duk_context *) thr;
836 	duk_hstring *h;
837 	const duk_uint8_t *p;
838 	duk_bufwriter_ctx bw_alloc;
839 	duk_bufwriter_ctx *bw;
840 	duk_uint8_t *q;
841 	duk_size_t i, n;
842 	duk_uint_fast8_t c_prev, c;
843 
844 	h = duk_get_hstring(ctx, idx_pattern);
845 	DUK_ASSERT(h != NULL);
846 	p = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);
847 	n = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h);
848 
849 	if (n == 0) {
850 		/* return '(?:)' */
851 		duk_push_hstring_stridx(ctx, DUK_STRIDX_ESCAPED_EMPTY_REGEXP);
852 		return;
853 	}
854 
855 	bw = &bw_alloc;
856 	DUK_BW_INIT_PUSHBUF(thr, bw, n);
857 	q = DUK_BW_GET_PTR(thr, bw);
858 
859 	c_prev = (duk_uint_fast8_t) 0;
860 
861 	for (i = 0; i < n; i++) {
862 		c = p[i];
863 
864 		q = DUK_BW_ENSURE_RAW(thr, bw, 2, q);
865 
866 		if (c == (duk_uint_fast8_t) '/' && c_prev != (duk_uint_fast8_t) '\\') {
867 			/* Unescaped '/' ANYWHERE in the regexp (in disjunction,
868 			 * inside a character class, ...) => same escape works.
869 			 */
870 			*q++ = DUK_ASC_BACKSLASH;
871 		}
872 		*q++ = (duk_uint8_t) c;
873 
874 		c_prev = c;
875 	}
876 
877 	DUK_BW_SETPTR_AND_COMPACT(thr, bw, q);
878 	duk_to_string(ctx, -1);  /* -> [ ... escaped_source ] */
879 }
880 
881 /*
882  *  Exposed regexp compilation primitive.
883  *
884  *  Sets up a regexp compilation context, and calls duk__parse_disjunction() to do the
885  *  actual parsing.  Handles generation of the compiled regexp header and the
886  *  "boilerplate" capture of the matching substring (save 0 and 1).  Also does some
887  *  global level regexp checks after recursive compilation has finished.
888  *
889  *  An escaped version of the regexp source, suitable for use as a RegExp instance
890  *  'source' property (see E5 Section 15.10.3), is also left on the stack.
891  *
892  *  Input stack:  [ pattern flags ]
893  *  Output stack: [ bytecode escaped_source ]  (both as strings)
894  */
895 
duk_regexp_compile(duk_hthread * thr)896 DUK_INTERNAL void duk_regexp_compile(duk_hthread *thr) {
897 	duk_context *ctx = (duk_context *) thr;
898 	duk_re_compiler_ctx re_ctx;
899 	duk_lexer_point lex_point;
900 	duk_hstring *h_pattern;
901 	duk_hstring *h_flags;
902 	duk__re_disjunction_info ign_disj;
903 
904 	DUK_ASSERT(thr != NULL);
905 	DUK_ASSERT(ctx != NULL);
906 
907 	/*
908 	 *  Args validation
909 	 */
910 
911 	/* TypeError if fails */
912 	h_pattern = duk_require_hstring(ctx, -2);
913 	h_flags = duk_require_hstring(ctx, -1);
914 
915 	/*
916 	 *  Create normalized 'source' property (E5 Section 15.10.3).
917 	 */
918 
919 	/* [ ... pattern flags ] */
920 
921 	duk__create_escaped_source(thr, -2);
922 
923 	/* [ ... pattern flags escaped_source ] */
924 
925 	/*
926 	 *  Init compilation context
927 	 */
928 
929 	/* [ ... pattern flags escaped_source buffer ] */
930 
931 	DUK_MEMZERO(&re_ctx, sizeof(re_ctx));
932 	DUK_LEXER_INITCTX(&re_ctx.lex);  /* duplicate zeroing, expect for (possible) NULL inits */
933 	re_ctx.thr = thr;
934 	re_ctx.lex.thr = thr;
935 	re_ctx.lex.input = DUK_HSTRING_GET_DATA(h_pattern);
936 	re_ctx.lex.input_length = DUK_HSTRING_GET_BYTELEN(h_pattern);
937 	re_ctx.lex.token_limit = DUK_RE_COMPILE_TOKEN_LIMIT;
938 	re_ctx.recursion_limit = DUK_USE_REGEXP_COMPILER_RECLIMIT;
939 	re_ctx.re_flags = duk__parse_regexp_flags(thr, h_flags);
940 
941 	DUK_BW_INIT_PUSHBUF(thr, &re_ctx.bw, DUK__RE_INITIAL_BUFSIZE);
942 
943 	DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld",
944 	                   (unsigned long) re_ctx.re_flags, (long) re_ctx.recursion_limit));
945 
946 	/*
947 	 *  Init lexer
948 	 */
949 
950 	lex_point.offset = 0;  /* expensive init, just want to fill window */
951 	lex_point.line = 1;
952 	DUK_LEXER_SETPOINT(&re_ctx.lex, &lex_point);
953 
954 	/*
955 	 *  Compilation
956 	 */
957 
958 	DUK_DD(DUK_DDPRINT("starting regexp compilation"));
959 
960 	duk__append_u32(&re_ctx, DUK_REOP_SAVE);
961 	duk__append_u32(&re_ctx, 0);
962 	duk__parse_disjunction(&re_ctx, 1 /*expect_eof*/, &ign_disj);
963 	duk__append_u32(&re_ctx, DUK_REOP_SAVE);
964 	duk__append_u32(&re_ctx, 1);
965 	duk__append_u32(&re_ctx, DUK_REOP_MATCH);
966 
967 	/*
968 	 *  Check for invalid backreferences; note that it is NOT an error
969 	 *  to back-reference a capture group which has not yet been introduced
970 	 *  in the pattern (as in /\1(foo)/); in fact, the backreference will
971 	 *  always match!  It IS an error to back-reference a capture group
972 	 *  which will never be introduced in the pattern.  Thus, we can check
973 	 *  for such references only after parsing is complete.
974 	 */
975 
976 	if (re_ctx.highest_backref > re_ctx.captures) {
977 		DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_BACKREFS);
978 	}
979 
980 	/*
981 	 *  Emit compiled regexp header: flags, ncaptures
982 	 *  (insertion order inverted on purpose)
983 	 */
984 
985 	duk__insert_u32(&re_ctx, 0, (re_ctx.captures + 1) * 2);
986 	duk__insert_u32(&re_ctx, 0, re_ctx.re_flags);
987 
988 	/* [ ... pattern flags escaped_source buffer ] */
989 
990 	DUK_BW_COMPACT(thr, &re_ctx.bw);
991 	duk_to_string(ctx, -1);  /* coerce to string */
992 
993 	/* [ ... pattern flags escaped_source bytecode ] */
994 
995 	/*
996 	 *  Finalize stack
997 	 */
998 
999 	duk_remove(ctx, -4);     /* -> [ ... flags escaped_source bytecode ] */
1000 	duk_remove(ctx, -3);     /* -> [ ... escaped_source bytecode ] */
1001 
1002 	DUK_DD(DUK_DDPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T",
1003 	                   (duk_tval *) duk_get_tval(ctx, -1), (duk_tval *) duk_get_tval(ctx, -2)));
1004 }
1005 
1006 /*
1007  *  Create a RegExp instance (E5 Section 15.10.7).
1008  *
1009  *  Note: the output stack left by duk_regexp_compile() is directly compatible
1010  *  with the input here.
1011  *
1012  *  Input stack:  [ escaped_source bytecode ]  (both as strings)
1013  *  Output stack: [ RegExp ]
1014  */
1015 
duk_regexp_create_instance(duk_hthread * thr)1016 DUK_INTERNAL void duk_regexp_create_instance(duk_hthread *thr) {
1017 	duk_context *ctx = (duk_context *) thr;
1018 	duk_hobject *h;
1019 	duk_hstring *h_bc;
1020 	duk_small_int_t re_flags;
1021 
1022 	/* [ ... escape_source bytecode ] */
1023 
1024 	h_bc = duk_get_hstring(ctx, -1);
1025 	DUK_ASSERT(h_bc != NULL);
1026 	DUK_ASSERT(DUK_HSTRING_GET_BYTELEN(h_bc) >= 1);          /* always at least the header */
1027 	DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h_bc) >= 1);
1028 	DUK_ASSERT((duk_small_int_t) DUK_HSTRING_GET_DATA(h_bc)[0] < 0x80);  /* flags always encodes to 1 byte */
1029 	re_flags = (duk_small_int_t) DUK_HSTRING_GET_DATA(h_bc)[0];
1030 
1031 	/* [ ... escaped_source bytecode ] */
1032 
1033 	duk_push_object(ctx);
1034 	h = duk_get_hobject(ctx, -1);
1035 	DUK_ASSERT(h != NULL);
1036 	duk_insert(ctx, -3);
1037 
1038 	/* [ ... regexp_object escaped_source bytecode ] */
1039 
1040 	DUK_HOBJECT_SET_CLASS_NUMBER(h, DUK_HOBJECT_CLASS_REGEXP);
1041 	DUK_HOBJECT_SET_PROTOTYPE_UPDREF(thr, h, thr->builtins[DUK_BIDX_REGEXP_PROTOTYPE]);
1042 
1043 	duk_xdef_prop_stridx(ctx, -3, DUK_STRIDX_INT_BYTECODE, DUK_PROPDESC_FLAGS_NONE);
1044 
1045 	/* [ ... regexp_object escaped_source ] */
1046 
1047 	duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_SOURCE, DUK_PROPDESC_FLAGS_NONE);
1048 
1049 	/* [ ... regexp_object ] */
1050 
1051 	duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_GLOBAL));
1052 	duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_GLOBAL, DUK_PROPDESC_FLAGS_NONE);
1053 
1054 	duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_IGNORE_CASE));
1055 	duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_IGNORE_CASE, DUK_PROPDESC_FLAGS_NONE);
1056 
1057 	duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_MULTILINE));
1058 	duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_MULTILINE, DUK_PROPDESC_FLAGS_NONE);
1059 
1060 	duk_push_int(ctx, 0);
1061 	duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_LAST_INDEX, DUK_PROPDESC_FLAGS_W);
1062 
1063 	/* [ ... regexp_object ] */
1064 }
1065 
1066 #undef DUK__RE_BUFLEN
1067 
1068 #else  /* DUK_USE_REGEXP_SUPPORT */
1069 
1070 /* regexp support disabled */
1071 
1072 #endif  /* DUK_USE_REGEXP_SUPPORT */
1073