ceph/src/jaegertracing/opentelemetry-cpp/third_party/prometheus-cpp/3rdparty/civetweb/src/third_party/duktape-1.8.0/src-separate/duk_regexp_compiler.c

   1 /*
   2  *  Regexp compilation.
   3  *
   4  *  See doc/regexp.rst for a discussion of the compilation approach and
   5  *  current limitations.
   6  *
   7  *  Regexp bytecode assumes jumps can be expressed with signed 32-bit
   8  *  integers.  Consequently the bytecode size must not exceed 0x7fffffffL.
   9  *  The implementation casts duk_size_t (buffer size) to duk_(u)int32_t
  10  *  in many places.  Although this could be changed, the bytecode format
  11  *  limit would still prevent regexps exceeding the signed 32-bit limit
  12  *  from working.
  13  *
  14  *  XXX: The implementation does not prevent bytecode from exceeding the
  15  *  maximum supported size.  This could be done by limiting the maximum
  16  *  input string size (assuming an upper bound can be computed for number
  17  *  of bytecode bytes emitted per input byte) or checking buffer maximum
  18  *  size when emitting bytecode (slower).
  19  */
  20
  21 #include "duk_internal.h"
  22
  23 #ifdef DUK_USE_REGEXP_SUPPORT
  24
  25 /*
  26  *  Helper macros
  27  */
  28
  29 #define DUK__RE_INITIAL_BUFSIZE 64
  30
  31 #undef DUK__RE_BUFLEN
  32 #define DUK__RE_BUFLEN(re_ctx) \
  33         DUK_BW_GET_SIZE(re_ctx->thr, &re_ctx->bw)
  34
  35 /*
  36  *  Disjunction struct: result of parsing a disjunction
  37  */
  38
  39 typedef struct {
  40         /* Number of characters that the atom matches (e.g. 3 for 'abc'),
  41          * -1 if atom is complex and number of matched characters either
  42          * varies or is not known.
  43          */
  44         duk_int32_t charlen;
  45
  46 #if 0
  47         /* These are not needed to implement quantifier capture handling,
  48          * but might be needed at some point.
  49          */
  50
  51         /* re_ctx->captures at start and end of atom parsing.
  52          * Since 'captures' indicates highest capture number emitted
  53          * so far in a DUK_REOP_SAVE, the captures numbers saved by
  54          * the atom are: ]start_captures,end_captures].
  55          */
  56         duk_uint32_t start_captures;
  57         duk_uint32_t end_captures;
  58 #endif
  59 } duk__re_disjunction_info;
  60
  61 /*
  62  *  Encoding helpers
  63  *
  64  *  Some of the typing is bytecode based, e.g. slice sizes are unsigned 32-bit
  65  *  even though the buffer operations will use duk_size_t.
  66  */
  67
  68 /* XXX: the insert helpers should ensure that the bytecode result is not
  69  * larger than expected (or at least assert for it).  Many things in the
  70  * bytecode, like skip offsets, won't work correctly if the bytecode is
  71  * larger than say 2G.
  72  */
  73
  74 DUK_LOCAL duk_uint32_t duk__encode_i32(duk_int32_t x) {
  75         if (x < 0) {
  76                 return ((duk_uint32_t) (-x)) * 2 + 1;
  77         } else {
  78                 return ((duk_uint32_t) x) * 2;
  79         }
  80 }
  81
  82 /* XXX: return type should probably be duk_size_t, or explicit checks are needed for
  83  * maximum size.
  84  */
  85 DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) {
  86         duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
  87         duk_small_int_t len;
  88
  89         len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
  90         DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, len);
  91         return (duk_uint32_t) len;
  92 }
  93
  94 DUK_LOCAL duk_uint32_t duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
  95         duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
  96         duk_small_int_t len;
  97
  98         len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
  99         DUK_BW_WRITE_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, buf, len);
 100         return (duk_uint32_t) len;
 101 }
 102
 103 DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {
 104         return duk__insert_u32(re_ctx, offset, duk__encode_i32(x));
 105 }
 106
 107 #if 0  /* unused */
 108 DUK_LOCAL duk_uint32_t duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {
 109         return duk__append_u32(re_ctx, duk__encode_i32(x));
 110 }
 111 #endif
 112
 113 /* special helper for emitting u16 lists (used for character ranges for built-in char classes) */
 114 DUK_LOCAL void duk__append_u16_list(duk_re_compiler_ctx *re_ctx, const duk_uint16_t *values, duk_uint32_t count) {
 115         /* Call sites don't need the result length so it's not accumulated. */
 116         while (count > 0) {
 117                 (void) duk__append_u32(re_ctx, (duk_uint32_t) (*values++));
 118                 count--;
 119         }
 120 }
 121
 122 DUK_LOCAL void duk__insert_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t data_offset, duk_uint32_t data_length) {
 123         DUK_BW_INSERT_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, offset, data_offset, data_length);
 124 }
 125
 126 DUK_LOCAL void duk__append_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
 127         DUK_BW_WRITE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
 128 }
 129
 130 DUK_LOCAL void duk__remove_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
 131         DUK_BW_REMOVE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
 132 }
 133
 134 /*
 135  *  Insert a jump offset at 'offset' to complete an instruction
 136  *  (the jump offset is always the last component of an instruction).
 137  *  The 'skip' argument must be computed relative to 'offset',
 138  *  -without- taking into account the skip field being inserted.
 139  *
 140  *       ... A B C ins X Y Z ...   (ins may be a JUMP, SPLIT1/SPLIT2, etc)
 141  *   =>  ... A B C ins SKIP X Y Z
 142  *
 143  *  Computing the final (adjusted) skip value, which is relative to the
 144  *  first byte of the next instruction, is a bit tricky because of the
 145  *  variable length UTF-8 encoding.  See doc/regexp.rst for discussion.
 146  */
 147 DUK_LOCAL duk_uint32_t duk__insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t skip) {
 148         duk_small_int_t len;
 149
 150         /* XXX: solve into closed form (smaller code) */
 151
 152         if (skip < 0) {
 153                 /* two encoding attempts suffices */
 154                 len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip));
 155                 len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip - (duk_int32_t) len));
 156                 DUK_ASSERT(duk_unicode_get_xutf8_length(duk__encode_i32(skip - (duk_int32_t) len)) == len);  /* no change */
 157                 skip -= (duk_int32_t) len;
 158         }
 159         return duk__insert_i32(re_ctx, offset, skip);
 160 }
 161
 162 DUK_LOCAL duk_uint32_t duk__append_jump_offset(duk_re_compiler_ctx *re_ctx, duk_int32_t skip) {
 163         return (duk_uint32_t) duk__insert_jump_offset(re_ctx, (duk_uint32_t) DUK__RE_BUFLEN(re_ctx), skip);
 164 }
 165
 166 /*
 167  *  duk_re_range_callback for generating character class ranges.
 168  *
 169  *  When ignoreCase is false, the range is simply emitted as is.
 170  *  We don't, for instance, eliminate duplicates or overlapping
 171  *  ranges in a character class.
 172  *
 173  *  When ignoreCase is true, the range needs to be normalized through
 174  *  canonicalization.  Unfortunately a canonicalized version of a
 175  *  continuous range is not necessarily continuous (e.g. [x-{] is
 176  *  continuous but [X-{] is not).  The current algorithm creates the
 177  *  canonicalized range(s) space efficiently at the cost of compile
 178  *  time execution time (see doc/regexp.rst for discussion).
 179  *
 180  *  Note that the ctx->nranges is a context-wide temporary value
 181  *  (this is OK because there cannot be multiple character classes
 182  *  being parsed simultaneously).
 183  */
 184
 185 DUK_LOCAL void duk__generate_ranges(void *userdata, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct) {
 186         duk_re_compiler_ctx *re_ctx = (duk_re_compiler_ctx *) userdata;
 187
 188         DUK_DD(DUK_DDPRINT("duk__generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld",
 189                            (void *) re_ctx, (long) r1, (long) r2, (long) direct));
 190
 191         if (!direct && (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE)) {
 192                 /*
 193                  *  Canonicalize a range, generating result ranges as necessary.
 194                  *  Needs to exhaustively scan the entire range (at most 65536
 195                  *  code points).  If 'direct' is set, caller (lexer) has ensured
 196                  *  that the range is already canonicalization compatible (this
 197                  *  is used to avoid unnecessary canonicalization of built-in
 198                  *  ranges like \W, which are not affected by canonicalization).
 199                  *
 200                  *  NOTE: here is one place where we don't want to support chars
 201                  *  outside the BMP, because the exhaustive search would be
 202                  *  massively larger.
 203                  */
 204
 205                 duk_codepoint_t i;
 206                 duk_codepoint_t t;
 207                 duk_codepoint_t r_start, r_end;
 208
 209                 r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);
 210                 r_end = r_start;
 211                 for (i = r1 + 1; i <= r2; i++) {
 212                         t = duk_unicode_re_canonicalize_char(re_ctx->thr, i);
 213                         if (t == r_end + 1) {
 214                                 r_end = t;
 215                         } else {
 216                                 DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
 217                                 duk__append_u32(re_ctx, (duk_uint32_t) r_start);
 218                                 duk__append_u32(re_ctx, (duk_uint32_t) r_end);
 219                                 re_ctx->nranges++;
 220                                 r_start = t;
 221                                 r_end = t;
 222                         }
 223                 }
 224                 DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
 225                 duk__append_u32(re_ctx, (duk_uint32_t) r_start);
 226                 duk__append_u32(re_ctx, (duk_uint32_t) r_end);
 227                 re_ctx->nranges++;
 228         } else {
 229                 DUK_DD(DUK_DDPRINT("direct, emit range: [%ld,%ld]", (long) r1, (long) r2));
 230                 duk__append_u32(re_ctx, (duk_uint32_t) r1);
 231                 duk__append_u32(re_ctx, (duk_uint32_t) r2);
 232                 re_ctx->nranges++;
 233         }
 234 }
 235
 236 /*
 237  *  Parse regexp Disjunction.  Most of regexp compilation happens here.
 238  *
 239  *  Handles Disjunction, Alternative, and Term productions directly without
 240  *  recursion.  The only constructs requiring recursion are positive/negative
 241  *  lookaheads, capturing parentheses, and non-capturing parentheses.
 242  *
 243  *  The function determines whether the entire disjunction is a 'simple atom'
 244  *  (see doc/regexp.rst discussion on 'simple quantifiers') and if so,
 245  *  returns the atom character length which is needed by the caller to keep
 246  *  track of its own atom character length.  A disjunction with more than one
 247  *  alternative is never considered a simple atom (although in some cases
 248  *  that might be the case).
 249  *
 250  *  Return value: simple atom character length or < 0 if not a simple atom.
 251  *  Appends the bytecode for the disjunction matcher to the end of the temp
 252  *  buffer.
 253  *
 254  *  Regexp top level structure is:
 255  *
 256  *    Disjunction = Term*
 257  *                | Term* | Disjunction
 258  *
 259  *    Term = Assertion
 260  *         | Atom
 261  *         | Atom Quantifier
 262  *
 263  *  An empty Term sequence is a valid disjunction alternative (e.g. /|||c||/).
 264  *
 265  *  Notes:
 266  *
 267  *    * Tracking of the 'simple-ness' of the current atom vs. the entire
 268  *      disjunction are separate matters.  For instance, the disjunction
 269  *      may be complex, but individual atoms may be simple.  Furthermore,
 270  *      simple quantifiers are used whenever possible, even if the
 271  *      disjunction as a whole is complex.
 272  *
 273  *    * The estimate of whether an atom is simple is conservative now,
 274  *      and it would be possible to expand it.  For instance, captures
 275  *      cause the disjunction to be marked complex, even though captures
 276  *      -can- be handled by simple quantifiers with some minor modifications.
 277  *
 278  *    * Disjunction 'tainting' as 'complex' is handled at the end of the
 279  *      main for loop collectively for atoms.  Assertions, quantifiers,
 280  *      and '|' tokens need to taint the result manually if necessary.
 281  *      Assertions cannot add to result char length, only atoms (and
 282  *      quantifiers) can; currently quantifiers will taint the result
 283  *      as complex though.
 284  */
 285
 286 DUK_LOCAL void duk__parse_disjunction(duk_re_compiler_ctx *re_ctx, duk_bool_t expect_eof, duk__re_disjunction_info *out_atom_info) {
 287         duk_int32_t atom_start_offset = -1;                   /* negative -> no atom matched on previous round */
 288         duk_int32_t atom_char_length = 0;                     /* negative -> complex atom */
 289         duk_uint32_t atom_start_captures = re_ctx->captures;  /* value of re_ctx->captures at start of atom */
 290         duk_int32_t unpatched_disjunction_split = -1;
 291         duk_int32_t unpatched_disjunction_jump = -1;
 292         duk_uint32_t entry_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
 293         duk_int32_t res_charlen = 0;  /* -1 if disjunction is complex, char length if simple */
 294         duk__re_disjunction_info tmp_disj;
 295
 296         DUK_ASSERT(out_atom_info != NULL);
 297
 298         if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
 299                 DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT);
 300         }
 301         re_ctx->recursion_depth++;
 302
 303 #if 0
 304         out_atom_info->start_captures = re_ctx->captures;
 305 #endif
 306
 307         for (;;) {
 308                 /* atom_char_length, atom_start_offset, atom_start_offset reflect the
 309                  * atom matched on the previous loop.  If a quantifier is encountered
 310                  * on this loop, these are needed to handle the quantifier correctly.
 311                  * new_atom_char_length etc are for the atom parsed on this round;
 312                  * they're written to atom_char_length etc at the end of the round.
 313                  */
 314                 duk_int32_t new_atom_char_length;   /* char length of the atom parsed in this loop */
 315                 duk_int32_t new_atom_start_offset;  /* bytecode start offset of the atom parsed in this loop
 316                                                      * (allows quantifiers to copy the atom bytecode)
 317                                                      */
 318                 duk_uint32_t new_atom_start_captures;  /* re_ctx->captures at the start of the atom parsed in this loop */
 319
 320                 duk_lexer_parse_re_token(&re_ctx->lex, &re_ctx->curr_token);
 321
 322                 DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)",
 323                                    (long) re_ctx->curr_token.t,
 324                                    (long) re_ctx->curr_token.num,
 325                                    (re_ctx->curr_token.num >= 0x20 && re_ctx->curr_token.num <= 0x7e) ?
 326                                    (int) re_ctx->curr_token.num : (int) '?'));
 327
 328                 /* set by atom case clauses */
 329                 new_atom_start_offset = -1;
 330                 new_atom_char_length = -1;
 331                 new_atom_start_captures = re_ctx->captures;
 332
 333                 switch (re_ctx->curr_token.t) {
 334                 case DUK_RETOK_DISJUNCTION: {
 335                         /*
 336                          *  The handling here is a bit tricky.  If a previous '|' has been processed,
 337                          *  we have a pending split1 and a pending jump (for a previous match).  These
 338                          *  need to be back-patched carefully.  See docs for a detailed example.
 339                          */
 340
 341                         /* patch pending jump and split */
 342                         if (unpatched_disjunction_jump >= 0) {
 343                                 duk_uint32_t offset;
 344
 345                                 DUK_ASSERT(unpatched_disjunction_split >= 0);
 346                                 offset = unpatched_disjunction_jump;
 347                                 offset += duk__insert_jump_offset(re_ctx,
 348                                                                   offset,
 349                                                                   (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
 350                                 /* offset is now target of the pending split (right after jump) */
 351                                 duk__insert_jump_offset(re_ctx,
 352                                                         unpatched_disjunction_split,
 353                                                         offset - unpatched_disjunction_split);
 354                         }
 355
 356                         /* add a new pending split to the beginning of the entire disjunction */
 357                         (void) duk__insert_u32(re_ctx,
 358                                                entry_offset,
 359                                                DUK_REOP_SPLIT1);   /* prefer direct execution */
 360                         unpatched_disjunction_split = entry_offset + 1;   /* +1 for opcode */
 361
 362                         /* add a new pending match jump for latest finished alternative */
 363                         duk__append_u32(re_ctx, DUK_REOP_JUMP);
 364                         unpatched_disjunction_jump = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 365
 366                         /* 'taint' result as complex */
 367                         res_charlen = -1;
 368                         break;
 369                 }
 370                 case DUK_RETOK_QUANTIFIER: {
 371                         if (atom_start_offset < 0) {
 372                                 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_NO_ATOM);
 373                         }
 374                         if (re_ctx->curr_token.qmin > re_ctx->curr_token.qmax) {
 375                                 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_VALUES);
 376                         }
 377                         if (atom_char_length >= 0) {
 378                                 /*
 379                                  *  Simple atom
 380                                  *
 381                                  *  If atom_char_length is zero, we'll have unbounded execution time for e.g.
 382                                  *  /()*x/.exec('x').  We can't just skip the match because it might have some
 383                                  *  side effects (for instance, if we allowed captures in simple atoms, the
 384                                  *  capture needs to happen).  The simple solution below is to force the
 385                                  *  quantifier to match at most once, since the additional matches have no effect.
 386                                  *
 387                                  *  With a simple atom there can be no capture groups, so no captures need
 388                                  *  to be reset.
 389                                  */
 390                                 duk_int32_t atom_code_length;
 391                                 duk_uint32_t offset;
 392                                 duk_uint32_t qmin, qmax;
 393
 394                                 qmin = re_ctx->curr_token.qmin;
 395                                 qmax = re_ctx->curr_token.qmax;
 396                                 if (atom_char_length == 0) {
 397                                         /* qmin and qmax will be 0 or 1 */
 398                                         if (qmin > 1) {
 399                                                 qmin = 1;
 400                                         }
 401                                         if (qmax > 1) {
 402                                                 qmax = 1;
 403                                         }
 404                                 }
 405
 406                                 duk__append_u32(re_ctx, DUK_REOP_MATCH);   /* complete 'sub atom' */
 407                                 atom_code_length = (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - atom_start_offset);
 408
 409                                 offset = atom_start_offset;
 410                                 if (re_ctx->curr_token.greedy) {
 411                                         offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQGREEDY);
 412                                         offset += duk__insert_u32(re_ctx, offset, qmin);
 413                                         offset += duk__insert_u32(re_ctx, offset, qmax);
 414                                         offset += duk__insert_u32(re_ctx, offset, atom_char_length);
 415                                         offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
 416                                 } else {
 417                                         offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQMINIMAL);
 418                                         offset += duk__insert_u32(re_ctx, offset, qmin);
 419                                         offset += duk__insert_u32(re_ctx, offset, qmax);
 420                                         offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
 421                                 }
 422                                 DUK_UNREF(offset);  /* silence scan-build warning */
 423                         } else {
 424                                 /*
 425                                  *  Complex atom
 426                                  *
 427                                  *  The original code is used as a template, and removed at the end
 428                                  *  (this differs from the handling of simple quantifiers).
 429                                  *
 430                                  *  NOTE: there is no current solution for empty atoms in complex
 431                                  *  quantifiers.  This would need some sort of a 'progress' instruction.
 432                                  *
 433                                  *  XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies?
 434                                  */
 435                                 duk_int32_t atom_code_length;
 436                                 duk_uint32_t atom_copies;
 437                                 duk_uint32_t tmp_qmin, tmp_qmax;
 438
 439                                 /* pre-check how many atom copies we're willing to make (atom_copies not needed below) */
 440                                 atom_copies = (re_ctx->curr_token.qmax == DUK_RE_QUANTIFIER_INFINITE) ?
 441                                               re_ctx->curr_token.qmin : re_ctx->curr_token.qmax;
 442                                 if (atom_copies > DUK_RE_MAX_ATOM_COPIES) {
 443                                         DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_QUANTIFIER_TOO_MANY_COPIES);
 444                                 }
 445
 446                                 /* wipe the capture range made by the atom (if any) */
 447                                 DUK_ASSERT(atom_start_captures <= re_ctx->captures);
 448                                 if (atom_start_captures != re_ctx->captures) {
 449                                         DUK_ASSERT(atom_start_captures < re_ctx->captures);
 450                                         DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]",
 451                                                              (long) atom_start_captures, (long) re_ctx->captures));
 452
 453                                         /* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */
 454                                         duk__insert_u32(re_ctx, atom_start_offset, (re_ctx->captures - atom_start_captures) * 2);
 455                                         duk__insert_u32(re_ctx, atom_start_offset, (atom_start_captures + 1) * 2);
 456                                         duk__insert_u32(re_ctx, atom_start_offset, DUK_REOP_WIPERANGE);
 457                                 } else {
 458                                         DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld",
 459                                                              (long) atom_start_captures));
 460                                 }
 461
 462                                 atom_code_length = (duk_int32_t) DUK__RE_BUFLEN(re_ctx) - atom_start_offset;
 463
 464                                 /* insert the required matches (qmin) by copying the atom */
 465                                 tmp_qmin = re_ctx->curr_token.qmin;
 466                                 tmp_qmax = re_ctx->curr_token.qmax;
 467                                 while (tmp_qmin > 0) {
 468                                         duk__append_slice(re_ctx, atom_start_offset, atom_code_length);
 469                                         tmp_qmin--;
 470                                         if (tmp_qmax != DUK_RE_QUANTIFIER_INFINITE) {
 471                                                 tmp_qmax--;
 472                                         }
 473                                 }
 474                                 DUK_ASSERT(tmp_qmin == 0);
 475
 476                                 /* insert code for matching the remainder - infinite or finite */
 477                                 if (tmp_qmax == DUK_RE_QUANTIFIER_INFINITE) {
 478                                         /* reuse last emitted atom for remaining 'infinite' quantifier */
 479
 480                                         if (re_ctx->curr_token.qmin == 0) {
 481                                                 /* Special case: original qmin was zero so there is nothing
 482                                                  * to repeat.  Emit an atom copy but jump over it here.
 483                                                  */
 484                                                 duk__append_u32(re_ctx, DUK_REOP_JUMP);
 485                                                 duk__append_jump_offset(re_ctx, atom_code_length);
 486                                                 duk__append_slice(re_ctx, atom_start_offset, atom_code_length);
 487                                         }
 488                                         if (re_ctx->curr_token.greedy) {
 489                                                 duk__append_u32(re_ctx, DUK_REOP_SPLIT2);   /* prefer jump */
 490                                         } else {
 491                                                 duk__append_u32(re_ctx, DUK_REOP_SPLIT1);   /* prefer direct */
 492                                         }
 493                                         duk__append_jump_offset(re_ctx, -atom_code_length - 1);  /* -1 for opcode */
 494                                 } else {
 495                                         /*
 496                                          *  The remaining matches are emitted as sequence of SPLITs and atom
 497                                          *  copies; the SPLITs skip the remaining copies and match the sequel.
 498                                          *  This sequence needs to be emitted starting from the last copy
 499                                          *  because the SPLITs are variable length due to the variable length
 500                                          *  skip offset.  This causes a lot of memory copying now.
 501                                          *
 502                                          *  Example structure (greedy, match maximum # atoms):
 503                                          *
 504                                          *      SPLIT1 LSEQ
 505                                          *      (atom)
 506                                          *      SPLIT1 LSEQ    ; <- the byte length of this instruction is needed
 507                                          *      (atom)         ; to encode the above SPLIT1 correctly
 508                                          *      ...
 509                                          *   LSEQ:
 510                                          */
 511                                         duk_uint32_t offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
 512                                         while (tmp_qmax > 0) {
 513                                                 duk__insert_slice(re_ctx, offset, atom_start_offset, atom_code_length);
 514                                                 if (re_ctx->curr_token.greedy) {
 515                                                         duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT1);   /* prefer direct */
 516                                                 } else {
 517                                                         duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT2);   /* prefer jump */
 518                                                 }
 519                                                 duk__insert_jump_offset(re_ctx,
 520                                                                         offset + 1,   /* +1 for opcode */
 521                                                                         (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
 522                                                 tmp_qmax--;
 523                                         }
 524                                 }
 525
 526                                 /* remove the original 'template' atom */
 527                                 duk__remove_slice(re_ctx, atom_start_offset, atom_code_length);
 528                         }
 529
 530                         /* 'taint' result as complex */
 531                         res_charlen = -1;
 532                         break;
 533                 }
 534                 case DUK_RETOK_ASSERT_START: {
 535                         duk__append_u32(re_ctx, DUK_REOP_ASSERT_START);
 536                         break;
 537                 }
 538                 case DUK_RETOK_ASSERT_END: {
 539                         duk__append_u32(re_ctx, DUK_REOP_ASSERT_END);
 540                         break;
 541                 }
 542                 case DUK_RETOK_ASSERT_WORD_BOUNDARY: {
 543                         duk__append_u32(re_ctx, DUK_REOP_ASSERT_WORD_BOUNDARY);
 544                         break;
 545                 }
 546                 case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY: {
 547                         duk__append_u32(re_ctx, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
 548                         break;
 549                 }
 550                 case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD:
 551                 case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD: {
 552                         duk_uint32_t offset;
 553                         duk_uint32_t opcode = (re_ctx->curr_token.t == DUK_RETOK_ASSERT_START_POS_LOOKAHEAD) ?
 554                                               DUK_REOP_LOOKPOS : DUK_REOP_LOOKNEG;
 555
 556                         offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
 557                         duk__parse_disjunction(re_ctx, 0, &tmp_disj);
 558                         duk__append_u32(re_ctx, DUK_REOP_MATCH);
 559
 560                         (void) duk__insert_u32(re_ctx, offset, opcode);
 561                         (void) duk__insert_jump_offset(re_ctx,
 562                                                        offset + 1,   /* +1 for opcode */
 563                                                        (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
 564
 565                         /* 'taint' result as complex -- this is conservative,
 566                          * as lookaheads do not backtrack.
 567                          */
 568                         res_charlen = -1;
 569                         break;
 570                 }
 571                 case DUK_RETOK_ATOM_PERIOD: {
 572                         new_atom_char_length = 1;
 573                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 574                         duk__append_u32(re_ctx, DUK_REOP_PERIOD);
 575                         break;
 576                 }
 577                 case DUK_RETOK_ATOM_CHAR: {
 578                         /* Note: successive characters could be joined into string matches
 579                          * but this is not trivial (consider e.g. '/xyz+/); see docs for
 580                          * more discussion.
 581                          */
 582                         duk_uint32_t ch;
 583
 584                         new_atom_char_length = 1;
 585                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 586                         duk__append_u32(re_ctx, DUK_REOP_CHAR);
 587                         ch = re_ctx->curr_token.num;
 588                         if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
 589                                 ch = duk_unicode_re_canonicalize_char(re_ctx->thr, ch);
 590                         }
 591                         duk__append_u32(re_ctx, ch);
 592                         break;
 593                 }
 594                 case DUK_RETOK_ATOM_DIGIT:
 595                 case DUK_RETOK_ATOM_NOT_DIGIT: {
 596                         new_atom_char_length = 1;
 597                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 598                         duk__append_u32(re_ctx,
 599                                         (re_ctx->curr_token.t == DUK_RETOK_ATOM_DIGIT) ?
 600                                         DUK_REOP_RANGES : DUK_REOP_INVRANGES);
 601                         duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t)));
 602                         duk__append_u16_list(re_ctx, duk_unicode_re_ranges_digit, sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));
 603                         break;
 604                 }
 605                 case DUK_RETOK_ATOM_WHITE:
 606                 case DUK_RETOK_ATOM_NOT_WHITE: {
 607                         new_atom_char_length = 1;
 608                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 609                         duk__append_u32(re_ctx,
 610                                         (re_ctx->curr_token.t == DUK_RETOK_ATOM_WHITE) ?
 611                                         DUK_REOP_RANGES : DUK_REOP_INVRANGES);
 612                         duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_white) / (2 * sizeof(duk_uint16_t)));
 613                         duk__append_u16_list(re_ctx, duk_unicode_re_ranges_white, sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t));
 614                         break;
 615                 }
 616                 case DUK_RETOK_ATOM_WORD_CHAR:
 617                 case DUK_RETOK_ATOM_NOT_WORD_CHAR: {
 618                         new_atom_char_length = 1;
 619                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 620                         duk__append_u32(re_ctx,
 621                                         (re_ctx->curr_token.t == DUK_RETOK_ATOM_WORD_CHAR) ?
 622                                         DUK_REOP_RANGES : DUK_REOP_INVRANGES);
 623                         duk__append_u32(re_ctx, sizeof(duk_unicode_re_ranges_wordchar) / (2 * sizeof(duk_uint16_t)));
 624                         duk__append_u16_list(re_ctx, duk_unicode_re_ranges_wordchar, sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t));
 625                         break;
 626                 }
 627                 case DUK_RETOK_ATOM_BACKREFERENCE: {
 628                         duk_uint32_t backref = (duk_uint32_t) re_ctx->curr_token.num;
 629                         if (backref > re_ctx->highest_backref) {
 630                                 re_ctx->highest_backref = backref;
 631                         }
 632                         new_atom_char_length = -1;   /* mark as complex */
 633                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 634                         duk__append_u32(re_ctx, DUK_REOP_BACKREFERENCE);
 635                         duk__append_u32(re_ctx, backref);
 636                         break;
 637                 }
 638                 case DUK_RETOK_ATOM_START_CAPTURE_GROUP: {
 639                         duk_uint32_t cap;
 640
 641                         new_atom_char_length = -1;   /* mark as complex (capture handling) */
 642                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 643                         cap = ++re_ctx->captures;
 644                         duk__append_u32(re_ctx, DUK_REOP_SAVE);
 645                         duk__append_u32(re_ctx, cap * 2);
 646                         duk__parse_disjunction(re_ctx, 0, &tmp_disj);  /* retval (sub-atom char length) unused, tainted as complex above */
 647                         duk__append_u32(re_ctx, DUK_REOP_SAVE);
 648                         duk__append_u32(re_ctx, cap * 2 + 1);
 649                         break;
 650                 }
 651                 case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP: {
 652                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 653                         duk__parse_disjunction(re_ctx, 0, &tmp_disj);
 654                         new_atom_char_length = tmp_disj.charlen;
 655                         break;
 656                 }
 657                 case DUK_RETOK_ATOM_START_CHARCLASS:
 658                 case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED: {
 659                         /*
 660                          *  Range parsing is done with a special lexer function which calls
 661                          *  us for every range parsed.  This is different from how rest of
 662                          *  the parsing works, but avoids a heavy, arbitrary size intermediate
 663                          *  value type to hold the ranges.
 664                          *
 665                          *  Another complication is the handling of character ranges when
 666                          *  case insensitive matching is used (see docs for discussion).
 667                          *  The range handler callback given to the lexer takes care of this
 668                          *  as well.
 669                          *
 670                          *  Note that duplicate ranges are not eliminated when parsing character
 671                          *  classes, so that canonicalization of
 672                          *
 673                          *    [0-9a-fA-Fx-{]
 674                          *
 675                          *  creates the result (note the duplicate ranges):
 676                          *
 677                          *    [0-9A-FA-FX-Z{-{]
 678                          *
 679                          *  where [x-{] is split as a result of canonicalization.  The duplicate
 680                          *  ranges are not a semantics issue: they work correctly.
 681                          */
 682
 683                         duk_uint32_t offset;
 684
 685                         DUK_DD(DUK_DDPRINT("character class"));
 686
 687                         /* insert ranges instruction, range count patched in later */
 688                         new_atom_char_length = 1;
 689                         new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
 690                         duk__append_u32(re_ctx,
 691                                         (re_ctx->curr_token.t == DUK_RETOK_ATOM_START_CHARCLASS) ?
 692                                         DUK_REOP_RANGES : DUK_REOP_INVRANGES);
 693                         offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);    /* patch in range count later */
 694
 695                         /* parse ranges until character class ends */
 696                         re_ctx->nranges = 0;    /* note: ctx-wide temporary */
 697                         duk_lexer_parse_re_ranges(&re_ctx->lex, duk__generate_ranges, (void *) re_ctx);
 698
 699                         /* insert range count */
 700                         duk__insert_u32(re_ctx, offset, re_ctx->nranges);
 701                         break;
 702                 }
 703                 case DUK_RETOK_ATOM_END_GROUP: {
 704                         if (expect_eof) {
 705                                 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_CLOSING_PAREN);
 706                         }
 707                         goto done;
 708                 }
 709                 case DUK_RETOK_EOF: {
 710                         if (!expect_eof) {
 711                                 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_END_OF_PATTERN);
 712                         }
 713                         goto done;
 714                 }
 715                 default: {
 716                         DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_REGEXP_TOKEN);
 717                 }
 718                 }
 719
 720                 /* a complex (new) atom taints the result */
 721                 if (new_atom_start_offset >= 0) {
 722                         if (new_atom_char_length < 0) {
 723                                 res_charlen = -1;
 724                         } else if (res_charlen >= 0) {
 725                                 /* only advance if not tainted */
 726                                 res_charlen += new_atom_char_length;
 727                         }
 728                 }
 729
 730                 /* record previous atom info in case next token is a quantifier */
 731                 atom_start_offset = new_atom_start_offset;
 732                 atom_char_length = new_atom_char_length;
 733                 atom_start_captures = new_atom_start_captures;
 734         }
 735
 736  done:
 737
 738         /* finish up pending jump and split for last alternative */
 739         if (unpatched_disjunction_jump >= 0) {
 740                 duk_uint32_t offset;
 741
 742                 DUK_ASSERT(unpatched_disjunction_split >= 0);
 743                 offset = unpatched_disjunction_jump;
 744                 offset += duk__insert_jump_offset(re_ctx,
 745                                                   offset,
 746                                                   (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
 747                 /* offset is now target of the pending split (right after jump) */
 748                 duk__insert_jump_offset(re_ctx,
 749                                         unpatched_disjunction_split,
 750                                         offset - unpatched_disjunction_split);
 751         }
 752
 753 #if 0
 754         out_atom_info->end_captures = re_ctx->captures;
 755 #endif
 756         out_atom_info->charlen = res_charlen;
 757         DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld",
 758                              (long) out_atom_info->charlen));
 759
 760         re_ctx->recursion_depth--;
 761 }
 762
 763 /*
 764  *  Flags parsing (see E5 Section 15.10.4.1).
 765  */
 766
 767 DUK_LOCAL duk_uint32_t duk__parse_regexp_flags(duk_hthread *thr, duk_hstring *h) {
 768         const duk_uint8_t *p;
 769         const duk_uint8_t *p_end;
 770         duk_uint32_t flags = 0;
 771
 772         p = DUK_HSTRING_GET_DATA(h);
 773         p_end = p + DUK_HSTRING_GET_BYTELEN(h);
 774
 775         /* Note: can be safely scanned as bytes (undecoded) */
 776
 777         while (p < p_end) {
 778                 duk_uint8_t c = *p++;
 779                 switch ((int) c) {
 780                 case (int) 'g': {
 781                         if (flags & DUK_RE_FLAG_GLOBAL) {
 782                                 goto error;
 783                         }
 784                         flags |= DUK_RE_FLAG_GLOBAL;
 785                         break;
 786                 }
 787                 case (int) 'i': {
 788                         if (flags & DUK_RE_FLAG_IGNORE_CASE) {
 789                                 goto error;
 790                         }
 791                         flags |= DUK_RE_FLAG_IGNORE_CASE;
 792                         break;
 793                 }
 794                 case (int) 'm': {
 795                         if (flags & DUK_RE_FLAG_MULTILINE) {
 796                                 goto error;
 797                         }
 798                         flags |= DUK_RE_FLAG_MULTILINE;
 799                         break;
 800                 }
 801                 default: {
 802                         goto error;
 803                 }
 804                 }
 805         }
 806
 807         return flags;
 808
 809  error:
 810         DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_REGEXP_FLAGS);
 811         return 0;  /* never here */
 812 }
 813
 814 /*
 815  *  Create escaped RegExp source (E5 Section 15.10.3).
 816  *
 817  *  The current approach is to special case the empty RegExp
 818  *  ('' -> '(?:)') and otherwise replace unescaped '/' characters
 819  *  with '\/' regardless of where they occur in the regexp.
 820  *
 821  *  Note that normalization does not seem to be necessary for
 822  *  RegExp literals (e.g. '/foo/') because to be acceptable as
 823  *  a RegExp literal, the text between forward slashes must
 824  *  already match the escaping requirements (e.g. must not contain
 825  *  unescaped forward slashes or be empty).  Escaping IS needed
 826  *  for expressions like 'new Regexp("...", "")' however.
 827  *  Currently, we re-escape in either case.
 828  *
 829  *  Also note that we process the source here in UTF-8 encoded
 830  *  form.  This is correct, because any non-ASCII characters are
 831  *  passed through without change.
 832  */
 833
 834 DUK_LOCAL void duk__create_escaped_source(duk_hthread *thr, int idx_pattern) {
 835         duk_context *ctx = (duk_context *) thr;
 836         duk_hstring *h;
 837         const duk_uint8_t *p;
 838         duk_bufwriter_ctx bw_alloc;
 839         duk_bufwriter_ctx *bw;
 840         duk_uint8_t *q;
 841         duk_size_t i, n;
 842         duk_uint_fast8_t c_prev, c;
 843
 844         h = duk_get_hstring(ctx, idx_pattern);
 845         DUK_ASSERT(h != NULL);
 846         p = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);
 847         n = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h);
 848
 849         if (n == 0) {
 850                 /* return '(?:)' */
 851                 duk_push_hstring_stridx(ctx, DUK_STRIDX_ESCAPED_EMPTY_REGEXP);
 852                 return;
 853         }
 854
 855         bw = &bw_alloc;
 856         DUK_BW_INIT_PUSHBUF(thr, bw, n);
 857         q = DUK_BW_GET_PTR(thr, bw);
 858
 859         c_prev = (duk_uint_fast8_t) 0;
 860
 861         for (i = 0; i < n; i++) {
 862                 c = p[i];
 863
 864                 q = DUK_BW_ENSURE_RAW(thr, bw, 2, q);
 865
 866                 if (c == (duk_uint_fast8_t) '/' && c_prev != (duk_uint_fast8_t) '\\') {
 867                         /* Unescaped '/' ANYWHERE in the regexp (in disjunction,
 868                          * inside a character class, ...) => same escape works.
 869                          */
 870                         *q++ = DUK_ASC_BACKSLASH;
 871                 }
 872                 *q++ = (duk_uint8_t) c;
 873
 874                 c_prev = c;
 875         }
 876
 877         DUK_BW_SETPTR_AND_COMPACT(thr, bw, q);
 878         duk_to_string(ctx, -1);  /* -> [ ... escaped_source ] */
 879 }
 880
 881 /*
 882  *  Exposed regexp compilation primitive.
 883  *
 884  *  Sets up a regexp compilation context, and calls duk__parse_disjunction() to do the
 885  *  actual parsing.  Handles generation of the compiled regexp header and the
 886  *  "boilerplate" capture of the matching substring (save 0 and 1).  Also does some
 887  *  global level regexp checks after recursive compilation has finished.
 888  *
 889  *  An escaped version of the regexp source, suitable for use as a RegExp instance
 890  *  'source' property (see E5 Section 15.10.3), is also left on the stack.
 891  *
 892  *  Input stack:  [ pattern flags ]
 893  *  Output stack: [ bytecode escaped_source ]  (both as strings)
 894  */
 895
 896 DUK_INTERNAL void duk_regexp_compile(duk_hthread *thr) {
 897         duk_context *ctx = (duk_context *) thr;
 898         duk_re_compiler_ctx re_ctx;
 899         duk_lexer_point lex_point;
 900         duk_hstring *h_pattern;
 901         duk_hstring *h_flags;
 902         duk__re_disjunction_info ign_disj;
 903
 904         DUK_ASSERT(thr != NULL);
 905         DUK_ASSERT(ctx != NULL);
 906
 907         /*
 908          *  Args validation
 909          */
 910
 911         /* TypeError if fails */
 912         h_pattern = duk_require_hstring(ctx, -2);
 913         h_flags = duk_require_hstring(ctx, -1);
 914
 915         /*
 916          *  Create normalized 'source' property (E5 Section 15.10.3).
 917          */
 918
 919         /* [ ... pattern flags ] */
 920
 921         duk__create_escaped_source(thr, -2);
 922
 923         /* [ ... pattern flags escaped_source ] */
 924
 925         /*
 926          *  Init compilation context
 927          */
 928
 929         /* [ ... pattern flags escaped_source buffer ] */
 930
 931         DUK_MEMZERO(&re_ctx, sizeof(re_ctx));
 932         DUK_LEXER_INITCTX(&re_ctx.lex);  /* duplicate zeroing, expect for (possible) NULL inits */
 933         re_ctx.thr = thr;
 934         re_ctx.lex.thr = thr;
 935         re_ctx.lex.input = DUK_HSTRING_GET_DATA(h_pattern);
 936         re_ctx.lex.input_length = DUK_HSTRING_GET_BYTELEN(h_pattern);
 937         re_ctx.lex.token_limit = DUK_RE_COMPILE_TOKEN_LIMIT;
 938         re_ctx.recursion_limit = DUK_USE_REGEXP_COMPILER_RECLIMIT;
 939         re_ctx.re_flags = duk__parse_regexp_flags(thr, h_flags);
 940
 941         DUK_BW_INIT_PUSHBUF(thr, &re_ctx.bw, DUK__RE_INITIAL_BUFSIZE);
 942
 943         DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld",
 944                            (unsigned long) re_ctx.re_flags, (long) re_ctx.recursion_limit));
 945
 946         /*
 947          *  Init lexer
 948          */
 949
 950         lex_point.offset = 0;  /* expensive init, just want to fill window */
 951         lex_point.line = 1;
 952         DUK_LEXER_SETPOINT(&re_ctx.lex, &lex_point);
 953
 954         /*
 955          *  Compilation
 956          */
 957
 958         DUK_DD(DUK_DDPRINT("starting regexp compilation"));
 959
 960         duk__append_u32(&re_ctx, DUK_REOP_SAVE);
 961         duk__append_u32(&re_ctx, 0);
 962         duk__parse_disjunction(&re_ctx, 1 /*expect_eof*/, &ign_disj);
 963         duk__append_u32(&re_ctx, DUK_REOP_SAVE);
 964         duk__append_u32(&re_ctx, 1);
 965         duk__append_u32(&re_ctx, DUK_REOP_MATCH);
 966
 967         /*
 968          *  Check for invalid backreferences; note that it is NOT an error
 969          *  to back-reference a capture group which has not yet been introduced
 970          *  in the pattern (as in /\1(foo)/); in fact, the backreference will
 971          *  always match!  It IS an error to back-reference a capture group
 972          *  which will never be introduced in the pattern.  Thus, we can check
 973          *  for such references only after parsing is complete.
 974          */
 975
 976         if (re_ctx.highest_backref > re_ctx.captures) {
 977                 DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_BACKREFS);
 978         }
 979
 980         /*
 981          *  Emit compiled regexp header: flags, ncaptures
 982          *  (insertion order inverted on purpose)
 983          */
 984
 985         duk__insert_u32(&re_ctx, 0, (re_ctx.captures + 1) * 2);
 986         duk__insert_u32(&re_ctx, 0, re_ctx.re_flags);
 987
 988         /* [ ... pattern flags escaped_source buffer ] */
 989
 990         DUK_BW_COMPACT(thr, &re_ctx.bw);
 991         duk_to_string(ctx, -1);  /* coerce to string */
 992
 993         /* [ ... pattern flags escaped_source bytecode ] */
 994
 995         /*
 996          *  Finalize stack
 997          */
 998
 999         duk_remove(ctx, -4);     /* -> [ ... flags escaped_source bytecode ] */
1000         duk_remove(ctx, -3);     /* -> [ ... escaped_source bytecode ] */
1001
1002         DUK_DD(DUK_DDPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T",
1003                            (duk_tval *) duk_get_tval(ctx, -1), (duk_tval *) duk_get_tval(ctx, -2)));
1004 }
1005
1006 /*
1007  *  Create a RegExp instance (E5 Section 15.10.7).
1008  *
1009  *  Note: the output stack left by duk_regexp_compile() is directly compatible
1010  *  with the input here.
1011  *
1012  *  Input stack:  [ escaped_source bytecode ]  (both as strings)
1013  *  Output stack: [ RegExp ]
1014  */
1015
1016 DUK_INTERNAL void duk_regexp_create_instance(duk_hthread *thr) {
1017         duk_context *ctx = (duk_context *) thr;
1018         duk_hobject *h;
1019         duk_hstring *h_bc;
1020         duk_small_int_t re_flags;
1021
1022         /* [ ... escape_source bytecode ] */
1023
1024         h_bc = duk_get_hstring(ctx, -1);
1025         DUK_ASSERT(h_bc != NULL);
1026         DUK_ASSERT(DUK_HSTRING_GET_BYTELEN(h_bc) >= 1);          /* always at least the header */
1027         DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h_bc) >= 1);
1028         DUK_ASSERT((duk_small_int_t) DUK_HSTRING_GET_DATA(h_bc)[0] < 0x80);  /* flags always encodes to 1 byte */
1029         re_flags = (duk_small_int_t) DUK_HSTRING_GET_DATA(h_bc)[0];
1030
1031         /* [ ... escaped_source bytecode ] */
1032
1033         duk_push_object(ctx);
1034         h = duk_get_hobject(ctx, -1);
1035         DUK_ASSERT(h != NULL);
1036         duk_insert(ctx, -3);
1037
1038         /* [ ... regexp_object escaped_source bytecode ] */
1039
1040         DUK_HOBJECT_SET_CLASS_NUMBER(h, DUK_HOBJECT_CLASS_REGEXP);
1041         DUK_HOBJECT_SET_PROTOTYPE_UPDREF(thr, h, thr->builtins[DUK_BIDX_REGEXP_PROTOTYPE]);
1042
1043         duk_xdef_prop_stridx(ctx, -3, DUK_STRIDX_INT_BYTECODE, DUK_PROPDESC_FLAGS_NONE);
1044
1045         /* [ ... regexp_object escaped_source ] */
1046
1047         duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_SOURCE, DUK_PROPDESC_FLAGS_NONE);
1048
1049         /* [ ... regexp_object ] */
1050
1051         duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_GLOBAL));
1052         duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_GLOBAL, DUK_PROPDESC_FLAGS_NONE);
1053
1054         duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_IGNORE_CASE));
1055         duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_IGNORE_CASE, DUK_PROPDESC_FLAGS_NONE);
1056
1057         duk_push_boolean(ctx, (re_flags & DUK_RE_FLAG_MULTILINE));
1058         duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_MULTILINE, DUK_PROPDESC_FLAGS_NONE);
1059
1060         duk_push_int(ctx, 0);
1061         duk_xdef_prop_stridx(ctx, -2, DUK_STRIDX_LAST_INDEX, DUK_PROPDESC_FLAGS_W);
1062
1063         /* [ ... regexp_object ] */
1064 }
1065
1066 #undef DUK__RE_BUFLEN
1067
1068 #else  /* DUK_USE_REGEXP_SUPPORT */
1069
1070 /* regexp support disabled */
1071
1072 #endif  /* DUK_USE_REGEXP_SUPPORT */