ceph/src/civetweb/src/third_party/duktape-1.3.0/src-separate/duk_regexp_executor.c

   1 /*
   2  *  Regexp executor.
   3  *
   4  *  Safety: the Ecmascript executor should prevent user from reading and
   5  *  replacing regexp bytecode.  Even so, the executor must validate all
   6  *  memory accesses etc.  When an invalid access is detected (e.g. a 'save'
   7  *  opcode to invalid, unallocated index) it should fail with an internal
   8  *  error but not cause a segmentation fault.
   9  *
  10  *  Notes:
  11  *
  12  *    - Backtrack counts are limited to unsigned 32 bits but should
  13  *      technically be duk_size_t for strings longer than 4G chars.
  14  *      This also requires a regexp bytecode change.
  15  */
  16
  17 #include "duk_internal.h"
  18
  19 #ifdef DUK_USE_REGEXP_SUPPORT
  20
  21 /*
  22  *  Helpers for UTF-8 handling
  23  *
  24  *  For bytecode readers the duk_uint32_t and duk_int32_t types are correct
  25  *  because they're used for more than just codepoints.
  26  */
  27
  28 DUK_LOCAL duk_uint32_t duk__bc_get_u32(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **pc) {
  29         return (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
  30 }
  31
  32 DUK_LOCAL duk_int32_t duk__bc_get_i32(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **pc) {
  33         duk_uint32_t t;
  34
  35         /* signed integer encoding needed to work with UTF-8 */
  36         t = (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
  37         if (t & 1) {
  38                 return -((duk_int32_t) (t >> 1));
  39         } else {
  40                 return (duk_int32_t) (t >> 1);
  41         }
  42 }
  43
  44 DUK_LOCAL const duk_uint8_t *duk__utf8_backtrack(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_uint_fast32_t count) {
  45         const duk_uint8_t *p;
  46
  47         /* Note: allow backtracking from p == ptr_end */
  48         p = *ptr;
  49         if (p < ptr_start || p > ptr_end) {
  50                 goto fail;
  51         }
  52
  53         while (count > 0) {
  54                 for (;;) {
  55                         p--;
  56                         if (p < ptr_start) {
  57                                 goto fail;
  58                         }
  59                         if ((*p & 0xc0) != 0x80) {
  60                                 /* utf-8 continuation bytes have the form 10xx xxxx */
  61                                 break;
  62                         }
  63                 }
  64                 count--;
  65         }
  66         *ptr = p;
  67         return p;
  68
  69  fail:
  70         DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, DUK_STR_REGEXP_BACKTRACK_FAILED);
  71         return NULL;  /* never here */
  72 }
  73
  74 DUK_LOCAL const duk_uint8_t *duk__utf8_advance(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_uint_fast32_t count) {
  75         const duk_uint8_t *p;
  76
  77         p = *ptr;
  78         if (p < ptr_start || p >= ptr_end) {
  79                 goto fail;
  80         }
  81
  82         while (count > 0) {
  83                 for (;;) {
  84                         p++;
  85
  86                         /* Note: if encoding ends by hitting end of input, we don't check that
  87                          * the encoding is valid, we just assume it is.
  88                          */
  89                         if (p >= ptr_end || ((*p & 0xc0) != 0x80)) {
  90                                 /* utf-8 continuation bytes have the form 10xx xxxx */
  91                                 break;
  92                         }
  93                 }
  94                 count--;
  95         }
  96
  97         *ptr = p;
  98         return p;
  99
 100  fail:
 101         DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, DUK_STR_REGEXP_ADVANCE_FAILED);
 102         return NULL;  /* never here */
 103 }
 104
 105 /*
 106  *  Helpers for dealing with the input string
 107  */
 108
 109 /* Get a (possibly canonicalized) input character from current sp.  The input
 110  * itself is never modified, and captures always record non-canonicalized
 111  * characters even in case-insensitive matching.
 112  */
 113 DUK_LOCAL duk_codepoint_t duk__inp_get_cp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **sp) {
 114         duk_codepoint_t res = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end);
 115         if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
 116                 res = duk_unicode_re_canonicalize_char(re_ctx->thr, res);
 117         }
 118         return res;
 119 }
 120
 121 DUK_LOCAL const duk_uint8_t *duk__inp_backtrack(duk_re_matcher_ctx *re_ctx, const duk_uint8_t **sp, duk_uint_fast32_t count) {
 122         return duk__utf8_backtrack(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end, count);
 123 }
 124
 125 /* Backtrack utf-8 input and return a (possibly canonicalized) input character. */
 126 DUK_LOCAL duk_codepoint_t duk__inp_get_prev_cp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t *sp) {
 127         /* note: caller 'sp' is intentionally not updated here */
 128         (void) duk__inp_backtrack(re_ctx, &sp, (duk_uint_fast32_t) 1);
 129         return duk__inp_get_cp(re_ctx, &sp);
 130 }
 131
 132 /*
 133  *  Regexp recursive matching function.
 134  *
 135  *  Returns 'sp' on successful match (points to character after last matched one),
 136  *  NULL otherwise.
 137  *
 138  *  The C recursion depth limit check is only performed in this function, this
 139  *  suffices because the function is present in all true recursion required by
 140  *  regexp execution.
 141  */
 142
 143 DUK_LOCAL const duk_uint8_t *duk__match_regexp(duk_re_matcher_ctx *re_ctx, const duk_uint8_t *pc, const duk_uint8_t *sp) {
 144         if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
 145                 DUK_ERROR(re_ctx->thr, DUK_ERR_RANGE_ERROR, DUK_STR_REGEXP_EXECUTOR_RECURSION_LIMIT);
 146         }
 147         re_ctx->recursion_depth++;
 148
 149         for (;;) {
 150                 duk_small_int_t op;
 151
 152                 if (re_ctx->steps_count >= re_ctx->steps_limit) {
 153                         DUK_ERROR(re_ctx->thr, DUK_ERR_RANGE_ERROR, DUK_STR_REGEXP_EXECUTOR_STEP_LIMIT);
 154                 }
 155                 re_ctx->steps_count++;
 156
 157                 op = (duk_small_int_t) duk__bc_get_u32(re_ctx, &pc);
 158
 159                 DUK_DDD(DUK_DDDPRINT("match: rec=%ld, steps=%ld, pc (after op)=%ld, sp=%ld, op=%ld",
 160                                      (long) re_ctx->recursion_depth,
 161                                      (long) re_ctx->steps_count,
 162                                      (long) (pc - re_ctx->bytecode),
 163                                      (long) (sp - re_ctx->input),
 164                                      (long) op));
 165
 166                 switch (op) {
 167                 case DUK_REOP_MATCH: {
 168                         goto match;
 169                 }
 170                 case DUK_REOP_CHAR: {
 171                         /*
 172                          *  Byte-based matching would be possible for case-sensitive
 173                          *  matching but not for case-insensitive matching.  So, we
 174                          *  match by decoding the input and bytecode character normally.
 175                          *
 176                          *  Bytecode characters are assumed to be already canonicalized.
 177                          *  Input characters are canonicalized automatically by
 178                          *  duk__inp_get_cp() if necessary.
 179                          *
 180                          *  There is no opcode for matching multiple characters.  The
 181                          *  regexp compiler has trouble joining strings efficiently
 182                          *  during compilation.  See doc/regexp.rst for more discussion.
 183                          */
 184                         duk_codepoint_t c1, c2;
 185
 186                         c1 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
 187                         DUK_ASSERT(!(re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) ||
 188                                    c1 == duk_unicode_re_canonicalize_char(re_ctx->thr, c1));  /* canonicalized by compiler */
 189                         if (sp >= re_ctx->input_end) {
 190                                 goto fail;
 191                         }
 192                         c2 = duk__inp_get_cp(re_ctx, &sp);
 193                         DUK_DDD(DUK_DDDPRINT("char match, c1=%ld, c2=%ld", (long) c1, (long) c2));
 194                         if (c1 != c2) {
 195                                 goto fail;
 196                         }
 197                         break;
 198                 }
 199                 case DUK_REOP_PERIOD: {
 200                         duk_codepoint_t c;
 201
 202                         if (sp >= re_ctx->input_end) {
 203                                 goto fail;
 204                         }
 205                         c = duk__inp_get_cp(re_ctx, &sp);
 206                         if (duk_unicode_is_line_terminator(c)) {
 207                                 /* E5 Sections 15.10.2.8, 7.3 */
 208                                 goto fail;
 209                         }
 210                         break;
 211                 }
 212                 case DUK_REOP_RANGES:
 213                 case DUK_REOP_INVRANGES: {
 214                         duk_uint32_t n;
 215                         duk_codepoint_t c;
 216                         duk_small_int_t match;
 217
 218                         n = duk__bc_get_u32(re_ctx, &pc);
 219                         if (sp >= re_ctx->input_end) {
 220                                 goto fail;
 221                         }
 222                         c = duk__inp_get_cp(re_ctx, &sp);
 223
 224                         match = 0;
 225                         while (n) {
 226                                 duk_codepoint_t r1, r2;
 227                                 r1 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
 228                                 r2 = (duk_codepoint_t) duk__bc_get_u32(re_ctx, &pc);
 229                                 DUK_DDD(DUK_DDDPRINT("matching ranges/invranges, n=%ld, r1=%ld, r2=%ld, c=%ld",
 230                                                      (long) n, (long) r1, (long) r2, (long) c));
 231                                 if (c >= r1 && c <= r2) {
 232                                         /* Note: don't bail out early, we must read all the ranges from
 233                                          * bytecode.  Another option is to skip them efficiently after
 234                                          * breaking out of here.  Prefer smallest code.
 235                                          */
 236                                         match = 1;
 237                                 }
 238                                 n--;
 239                         }
 240
 241                         if (op == DUK_REOP_RANGES) {
 242                                 if (!match) {
 243                                         goto fail;
 244                                 }
 245                         } else {
 246                                 DUK_ASSERT(op == DUK_REOP_INVRANGES);
 247                                 if (match) {
 248                                         goto fail;
 249                                 }
 250                         }
 251                         break;
 252                 }
 253                 case DUK_REOP_ASSERT_START: {
 254                         duk_codepoint_t c;
 255
 256                         if (sp <= re_ctx->input) {
 257                                 break;
 258                         }
 259                         if (!(re_ctx->re_flags & DUK_RE_FLAG_MULTILINE)) {
 260                                 goto fail;
 261                         }
 262                         c = duk__inp_get_prev_cp(re_ctx, sp);
 263                         if (duk_unicode_is_line_terminator(c)) {
 264                                 /* E5 Sections 15.10.2.8, 7.3 */
 265                                 break;
 266                         }
 267                         goto fail;
 268                 }
 269                 case DUK_REOP_ASSERT_END: {
 270                         duk_codepoint_t c;
 271                         const duk_uint8_t *tmp_sp;
 272
 273                         if (sp >= re_ctx->input_end) {
 274                                 break;
 275                         }
 276                         if (!(re_ctx->re_flags & DUK_RE_FLAG_MULTILINE)) {
 277                                 goto fail;
 278                         }
 279                         tmp_sp = sp;
 280                         c = duk__inp_get_cp(re_ctx, &tmp_sp);
 281                         if (duk_unicode_is_line_terminator(c)) {
 282                                 /* E5 Sections 15.10.2.8, 7.3 */
 283                                 break;
 284                         }
 285                         goto fail;
 286                 }
 287                 case DUK_REOP_ASSERT_WORD_BOUNDARY:
 288                 case DUK_REOP_ASSERT_NOT_WORD_BOUNDARY: {
 289                         /*
 290                          *  E5 Section 15.10.2.6.  The previous and current character
 291                          *  should -not- be canonicalized as they are now.  However,
 292                          *  canonicalization does not affect the result of IsWordChar()
 293                          *  (which depends on Unicode characters never canonicalizing
 294                          *  into ASCII characters) so this does not matter.
 295                          */
 296                         duk_small_int_t w1, w2;
 297
 298                         if (sp <= re_ctx->input) {
 299                                 w1 = 0;  /* not a wordchar */
 300                         } else {
 301                                 duk_codepoint_t c;
 302                                 c = duk__inp_get_prev_cp(re_ctx, sp);
 303                                 w1 = duk_unicode_re_is_wordchar(c);
 304                         }
 305                         if (sp >= re_ctx->input_end) {
 306                                 w2 = 0;  /* not a wordchar */
 307                         } else {
 308                                 const duk_uint8_t *tmp_sp = sp;  /* dummy so sp won't get updated */
 309                                 duk_codepoint_t c;
 310                                 c = duk__inp_get_cp(re_ctx, &tmp_sp);
 311                                 w2 = duk_unicode_re_is_wordchar(c);
 312                         }
 313
 314                         if (op == DUK_REOP_ASSERT_WORD_BOUNDARY) {
 315                                 if (w1 == w2) {
 316                                         goto fail;
 317                                 }
 318                         } else {
 319                                 DUK_ASSERT(op == DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
 320                                 if (w1 != w2) {
 321                                         goto fail;
 322                                 }
 323                         }
 324                         break;
 325                 }
 326                 case DUK_REOP_JUMP: {
 327                         duk_int32_t skip;
 328
 329                         skip = duk__bc_get_i32(re_ctx, &pc);
 330                         pc += skip;
 331                         break;
 332                 }
 333                 case DUK_REOP_SPLIT1: {
 334                         /* split1: prefer direct execution (no jump) */
 335                         const duk_uint8_t *sub_sp;
 336                         duk_int32_t skip;
 337
 338                         skip = duk__bc_get_i32(re_ctx, &pc);
 339                         sub_sp = duk__match_regexp(re_ctx, pc, sp);
 340                         if (sub_sp) {
 341                                 sp = sub_sp;
 342                                 goto match;
 343                         }
 344                         pc += skip;
 345                         break;
 346                 }
 347                 case DUK_REOP_SPLIT2: {
 348                         /* split2: prefer jump execution (not direct) */
 349                         const duk_uint8_t *sub_sp;
 350                         duk_int32_t skip;
 351
 352                         skip = duk__bc_get_i32(re_ctx, &pc);
 353                         sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
 354                         if (sub_sp) {
 355                                 sp = sub_sp;
 356                                 goto match;
 357                         }
 358                         break;
 359                 }
 360                 case DUK_REOP_SQMINIMAL: {
 361                         duk_uint32_t q, qmin, qmax;
 362                         duk_int32_t skip;
 363                         const duk_uint8_t *sub_sp;
 364
 365                         qmin = duk__bc_get_u32(re_ctx, &pc);
 366                         qmax = duk__bc_get_u32(re_ctx, &pc);
 367                         skip = duk__bc_get_i32(re_ctx, &pc);
 368                         DUK_DDD(DUK_DDDPRINT("minimal quantifier, qmin=%lu, qmax=%lu, skip=%ld",
 369                                              (unsigned long) qmin, (unsigned long) qmax, (long) skip));
 370
 371                         q = 0;
 372                         while (q <= qmax) {
 373                                 if (q >= qmin) {
 374                                         sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
 375                                         if (sub_sp) {
 376                                                 sp = sub_sp;
 377                                                 goto match;
 378                                         }
 379                                 }
 380                                 sub_sp = duk__match_regexp(re_ctx, pc, sp);
 381                                 if (!sub_sp) {
 382                                         break;
 383                                 }
 384                                 sp = sub_sp;
 385                                 q++;
 386                         }
 387                         goto fail;
 388                 }
 389                 case DUK_REOP_SQGREEDY: {
 390                         duk_uint32_t q, qmin, qmax, atomlen;
 391                         duk_int32_t skip;
 392                         const duk_uint8_t *sub_sp;
 393
 394                         qmin = duk__bc_get_u32(re_ctx, &pc);
 395                         qmax = duk__bc_get_u32(re_ctx, &pc);
 396                         atomlen = duk__bc_get_u32(re_ctx, &pc);
 397                         skip = duk__bc_get_i32(re_ctx, &pc);
 398                         DUK_DDD(DUK_DDDPRINT("greedy quantifier, qmin=%lu, qmax=%lu, atomlen=%lu, skip=%ld",
 399                                              (unsigned long) qmin, (unsigned long) qmax, (unsigned long) atomlen, (long) skip));
 400
 401                         q = 0;
 402                         while (q < qmax) {
 403                                 sub_sp = duk__match_regexp(re_ctx, pc, sp);
 404                                 if (!sub_sp) {
 405                                         break;
 406                                 }
 407                                 sp = sub_sp;
 408                                 q++;
 409                         }
 410                         while (q >= qmin) {
 411                                 sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
 412                                 if (sub_sp) {
 413                                         sp = sub_sp;
 414                                         goto match;
 415                                 }
 416                                 if (q == qmin) {
 417                                         break;
 418                                 }
 419
 420                                 /* Note: if atom were to contain e.g. captures, we would need to
 421                                  * re-match the atom to get correct captures.  Simply quantifiers
 422                                  * do not allow captures in their atom now, so this is not an issue.
 423                                  */
 424
 425                                 DUK_DDD(DUK_DDDPRINT("greedy quantifier, backtrack %ld characters (atomlen)",
 426                                                      (long) atomlen));
 427                                 sp = duk__inp_backtrack(re_ctx, &sp, (duk_uint_fast32_t) atomlen);
 428                                 q--;
 429                         }
 430                         goto fail;
 431                 }
 432                 case DUK_REOP_SAVE: {
 433                         duk_uint32_t idx;
 434                         const duk_uint8_t *old;
 435                         const duk_uint8_t *sub_sp;
 436
 437                         idx = duk__bc_get_u32(re_ctx, &pc);
 438                         if (idx >= re_ctx->nsaved) {
 439                                 /* idx is unsigned, < 0 check is not necessary */
 440                                 DUK_D(DUK_DPRINT("internal error, regexp save index insane: idx=%ld", (long) idx));
 441                                 goto internal_error;
 442                         }
 443                         old = re_ctx->saved[idx];
 444                         re_ctx->saved[idx] = sp;
 445                         sub_sp = duk__match_regexp(re_ctx, pc, sp);
 446                         if (sub_sp) {
 447                                 sp = sub_sp;
 448                                 goto match;
 449                         }
 450                         re_ctx->saved[idx] = old;
 451                         goto fail;
 452                 }
 453                 case DUK_REOP_WIPERANGE: {
 454                         /* Wipe capture range and save old values for backtracking.
 455                          *
 456                          * XXX: this typically happens with a relatively small idx_count.
 457                          * It might be useful to handle cases where the count is small
 458                          * (say <= 8) by saving the values in stack instead.  This would
 459                          * reduce memory churn and improve performance, at the cost of a
 460                          * slightly higher code footprint.
 461                          */
 462                         duk_uint32_t idx_start, idx_count;
 463 #ifdef DUK_USE_EXPLICIT_NULL_INIT
 464                         duk_uint32_t idx_end, idx;
 465 #endif
 466                         duk_uint8_t **range_save;
 467                         const duk_uint8_t *sub_sp;
 468
 469                         idx_start = duk__bc_get_u32(re_ctx, &pc);
 470                         idx_count = duk__bc_get_u32(re_ctx, &pc);
 471                         DUK_DDD(DUK_DDDPRINT("wipe saved range: start=%ld, count=%ld -> [%ld,%ld] (captures [%ld,%ld])",
 472                                              (long) idx_start, (long) idx_count,
 473                                              (long) idx_start, (long) (idx_start + idx_count - 1),
 474                                              (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
 475                         if (idx_start + idx_count > re_ctx->nsaved || idx_count == 0) {
 476                                 /* idx is unsigned, < 0 check is not necessary */
 477                                 DUK_D(DUK_DPRINT("internal error, regexp wipe indices insane: idx_start=%ld, idx_count=%ld",
 478                                                  (long) idx_start, (long) idx_count));
 479                                 goto internal_error;
 480                         }
 481                         DUK_ASSERT(idx_count > 0);
 482
 483                         duk_require_stack((duk_context *) re_ctx->thr, 1);
 484                         range_save = (duk_uint8_t **) duk_push_fixed_buffer((duk_context *) re_ctx->thr,
 485                                                                             sizeof(duk_uint8_t *) * idx_count);
 486                         DUK_ASSERT(range_save != NULL);
 487                         DUK_MEMCPY(range_save, re_ctx->saved + idx_start, sizeof(duk_uint8_t *) * idx_count);
 488 #ifdef DUK_USE_EXPLICIT_NULL_INIT
 489                         idx_end = idx_start + idx_count;
 490                         for (idx = idx_start; idx < idx_end; idx++) {
 491                                 re_ctx->saved[idx] = NULL;
 492                         }
 493 #else
 494                         DUK_MEMZERO((void *) (re_ctx->saved + idx_start), sizeof(duk_uint8_t *) * idx_count);
 495 #endif
 496
 497                         sub_sp = duk__match_regexp(re_ctx, pc, sp);
 498                         if (sub_sp) {
 499                                 /* match: keep wiped/resaved values */
 500                                 DUK_DDD(DUK_DDDPRINT("match: keep wiped/resaved values [%ld,%ld] (captures [%ld,%ld])",
 501                                                      (long) idx_start, (long) (idx_start + idx_count - 1),
 502                                                      (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
 503                                 duk_pop((duk_context *) re_ctx->thr);
 504                                 sp = sub_sp;
 505                                 goto match;
 506                         }
 507
 508                         /* fail: restore saves */
 509                         DUK_DDD(DUK_DDDPRINT("fail: restore wiped/resaved values [%ld,%ld] (captures [%ld,%ld])",
 510                                              (long) idx_start, (long) (idx_start + idx_count - 1),
 511                                              (long) (idx_start / 2), (long) ((idx_start + idx_count - 1) / 2)));
 512                         DUK_MEMCPY((void *) (re_ctx->saved + idx_start),
 513                                    (const void *) range_save,
 514                                    sizeof(duk_uint8_t *) * idx_count);
 515                         duk_pop((duk_context *) re_ctx->thr);
 516                         goto fail;
 517                 }
 518                 case DUK_REOP_LOOKPOS:
 519                 case DUK_REOP_LOOKNEG: {
 520                         /*
 521                          *  Needs a save of multiple saved[] entries depending on what range
 522                          *  may be overwritten.  Because the regexp parser does no such analysis,
 523                          *  we currently save the entire saved array here.  Lookaheads are thus
 524                          *  a bit expensive.  Note that the saved array is not needed for just
 525                          *  the lookahead sub-match, but for the matching of the entire sequel.
 526                          *
 527                          *  The temporary save buffer is pushed on to the valstack to handle
 528                          *  errors correctly.  Each lookahead causes a C recursion and pushes
 529                          *  more stuff on the value stack.  If the C recursion limit is less
 530                          *  than the value stack spare, there is no need to check the stack.
 531                          *  We do so regardless, just in case.
 532                          */
 533
 534                         duk_int32_t skip;
 535                         duk_uint8_t **full_save;
 536                         const duk_uint8_t *sub_sp;
 537
 538                         DUK_ASSERT(re_ctx->nsaved > 0);
 539
 540                         duk_require_stack((duk_context *) re_ctx->thr, 1);
 541                         full_save = (duk_uint8_t **) duk_push_fixed_buffer((duk_context *) re_ctx->thr,
 542                                                                            sizeof(duk_uint8_t *) * re_ctx->nsaved);
 543                         DUK_ASSERT(full_save != NULL);
 544                         DUK_MEMCPY(full_save, re_ctx->saved, sizeof(duk_uint8_t *) * re_ctx->nsaved);
 545
 546                         skip = duk__bc_get_i32(re_ctx, &pc);
 547                         sub_sp = duk__match_regexp(re_ctx, pc, sp);
 548                         if (op == DUK_REOP_LOOKPOS) {
 549                                 if (!sub_sp) {
 550                                         goto lookahead_fail;
 551                                 }
 552                         } else {
 553                                 if (sub_sp) {
 554                                         goto lookahead_fail;
 555                                 }
 556                         }
 557                         sub_sp = duk__match_regexp(re_ctx, pc + skip, sp);
 558                         if (sub_sp) {
 559                                 /* match: keep saves */
 560                                 duk_pop((duk_context *) re_ctx->thr);
 561                                 sp = sub_sp;
 562                                 goto match;
 563                         }
 564
 565                         /* fall through */
 566
 567                  lookahead_fail:
 568                         /* fail: restore saves */
 569                         DUK_MEMCPY((void *) re_ctx->saved,
 570                                    (const void *) full_save,
 571                                    sizeof(duk_uint8_t *) * re_ctx->nsaved);
 572                         duk_pop((duk_context *) re_ctx->thr);
 573                         goto fail;
 574                 }
 575                 case DUK_REOP_BACKREFERENCE: {
 576                         /*
 577                          *  Byte matching for back-references would be OK in case-
 578                          *  sensitive matching.  In case-insensitive matching we need
 579                          *  to canonicalize characters, so back-reference matching needs
 580                          *  to be done with codepoints instead.  So, we just decode
 581                          *  everything normally here, too.
 582                          *
 583                          *  Note: back-reference index which is 0 or higher than
 584                          *  NCapturingParens (= number of capturing parens in the
 585                          *  -entire- regexp) is a compile time error.  However, a
 586                          *  backreference referring to a valid capture which has
 587                          *  not matched anything always succeeds!  See E5 Section
 588                          *  15.10.2.9, step 5, sub-step 3.
 589                          */
 590                         duk_uint32_t idx;
 591                         const duk_uint8_t *p;
 592
 593                         idx = duk__bc_get_u32(re_ctx, &pc);
 594                         idx = idx << 1;  /* backref n -> saved indices [n*2, n*2+1] */
 595                         if (idx < 2 || idx + 1 >= re_ctx->nsaved) {
 596                                 /* regexp compiler should catch these */
 597                                 DUK_D(DUK_DPRINT("internal error, backreference index insane"));
 598                                 goto internal_error;
 599                         }
 600                         if (!re_ctx->saved[idx] || !re_ctx->saved[idx+1]) {
 601                                 /* capture is 'undefined', always matches! */
 602                                 DUK_DDD(DUK_DDDPRINT("backreference: saved[%ld,%ld] not complete, always match",
 603                                                      (long) idx, (long) (idx + 1)));
 604                                 break;
 605                         }
 606                         DUK_DDD(DUK_DDDPRINT("backreference: match saved[%ld,%ld]", (long) idx, (long) (idx + 1)));
 607
 608                         p = re_ctx->saved[idx];
 609                         while (p < re_ctx->saved[idx+1]) {
 610                                 duk_codepoint_t c1, c2;
 611
 612                                 /* Note: not necessary to check p against re_ctx->input_end:
 613                                  * the memory access is checked by duk__inp_get_cp(), while
 614                                  * valid compiled regexps cannot write a saved[] entry
 615                                  * which points to outside the string.
 616                                  */
 617                                 if (sp >= re_ctx->input_end) {
 618                                         goto fail;
 619                                 }
 620                                 c1 = duk__inp_get_cp(re_ctx, &p);
 621                                 c2 = duk__inp_get_cp(re_ctx, &sp);
 622                                 if (c1 != c2) {
 623                                         goto fail;
 624                                 }
 625                         }
 626                         break;
 627                 }
 628                 default: {
 629                         DUK_D(DUK_DPRINT("internal error, regexp opcode error: %ld", (long) op));
 630                         goto internal_error;
 631                 }
 632                 }
 633         }
 634
 635  match:
 636         re_ctx->recursion_depth--;
 637         return sp;
 638
 639  fail:
 640         re_ctx->recursion_depth--;
 641         return NULL;
 642
 643  internal_error:
 644         DUK_ERROR(re_ctx->thr, DUK_ERR_INTERNAL_ERROR, DUK_STR_REGEXP_INTERNAL_ERROR);
 645         return NULL;  /* never here */
 646 }
 647
 648 /*
 649  *  Exposed matcher function which provides the semantics of RegExp.prototype.exec().
 650  *
 651  *  RegExp.prototype.test() has the same semantics as exec() but does not return the
 652  *  result object (which contains the matching string and capture groups).  Currently
 653  *  there is no separate test() helper, so a temporary result object is created and
 654  *  discarded if test() is needed.  This is intentional, to save code space.
 655  *
 656  *  Input stack:  [ ... re_obj input ]
 657  *  Output stack: [ ... result ]
 658  */
 659
 660 DUK_LOCAL void duk__regexp_match_helper(duk_hthread *thr, duk_small_int_t force_global) {
 661         duk_context *ctx = (duk_context *) thr;
 662         duk_re_matcher_ctx re_ctx;
 663         duk_hobject *h_regexp;
 664         duk_hstring *h_bytecode;
 665         duk_hstring *h_input;
 666         const duk_uint8_t *pc;
 667         const duk_uint8_t *sp;
 668         duk_small_int_t match = 0;
 669         duk_small_int_t global;
 670         duk_uint_fast32_t i;
 671         double d;
 672         duk_uint32_t char_offset;
 673
 674         DUK_ASSERT(thr != NULL);
 675         DUK_ASSERT(ctx != NULL);
 676
 677         DUK_DD(DUK_DDPRINT("regexp match: regexp=%!T, input=%!T",
 678                            (duk_tval *) duk_get_tval(ctx, -2),
 679                            (duk_tval *) duk_get_tval(ctx, -1)));
 680
 681         /*
 682          *  Regexp instance check, bytecode check, input coercion.
 683          *
 684          *  See E5 Section 15.10.6.
 685          */
 686
 687         /* TypeError if wrong; class check, see E5 Section 15.10.6 */
 688         h_regexp = duk_require_hobject_with_class(ctx, -2, DUK_HOBJECT_CLASS_REGEXP);
 689         DUK_ASSERT(h_regexp != NULL);
 690         DUK_ASSERT(DUK_HOBJECT_GET_CLASS_NUMBER(h_regexp) == DUK_HOBJECT_CLASS_REGEXP);
 691         DUK_UNREF(h_regexp);
 692
 693         duk_to_string(ctx, -1);
 694         h_input = duk_get_hstring(ctx, -1);
 695         DUK_ASSERT(h_input != NULL);
 696
 697         duk_get_prop_stridx(ctx, -2, DUK_STRIDX_INT_BYTECODE);  /* [ ... re_obj input ] -> [ ... re_obj input bc ] */
 698         h_bytecode = duk_require_hstring(ctx, -1);  /* no regexp instance should exist without a non-configurable bytecode property */
 699         DUK_ASSERT(h_bytecode != NULL);
 700
 701         /*
 702          *  Basic context initialization.
 703          *
 704          *  Some init values are read from the bytecode header
 705          *  whose format is (UTF-8 codepoints):
 706          *
 707          *    uint   flags
 708          *    uint   nsaved (even, 2n+2 where n = num captures)
 709          */
 710
 711         /* [ ... re_obj input bc ] */
 712
 713         DUK_MEMZERO(&re_ctx, sizeof(re_ctx));
 714
 715         re_ctx.thr = thr;
 716         re_ctx.input = (duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
 717         re_ctx.input_end = re_ctx.input + DUK_HSTRING_GET_BYTELEN(h_input);
 718         re_ctx.bytecode = (duk_uint8_t *) DUK_HSTRING_GET_DATA(h_bytecode);
 719         re_ctx.bytecode_end = re_ctx.bytecode + DUK_HSTRING_GET_BYTELEN(h_bytecode);
 720         re_ctx.saved = NULL;
 721         re_ctx.recursion_limit = DUK_USE_REGEXP_EXECUTOR_RECLIMIT;
 722         re_ctx.steps_limit = DUK_RE_EXECUTE_STEPS_LIMIT;
 723
 724         /* read header */
 725         pc = re_ctx.bytecode;
 726         re_ctx.re_flags = duk__bc_get_u32(&re_ctx, &pc);
 727         re_ctx.nsaved = duk__bc_get_u32(&re_ctx, &pc);
 728         re_ctx.bytecode = pc;
 729
 730         DUK_ASSERT(DUK_RE_FLAG_GLOBAL < 0x10000UL);  /* must fit into duk_small_int_t */
 731         global = (duk_small_int_t) (force_global | (re_ctx.re_flags & DUK_RE_FLAG_GLOBAL));
 732
 733         DUK_ASSERT(re_ctx.nsaved >= 2);
 734         DUK_ASSERT((re_ctx.nsaved % 2) == 0);
 735
 736         duk_push_fixed_buffer(ctx, sizeof(duk_uint8_t *) * re_ctx.nsaved);
 737         re_ctx.saved = (const duk_uint8_t **) duk_get_buffer(ctx, -1, NULL);
 738         DUK_ASSERT(re_ctx.saved != NULL);
 739
 740         /* [ ... re_obj input bc saved_buf ] */
 741
 742         /* buffer is automatically zeroed */
 743 #ifdef DUK_USE_EXPLICIT_NULL_INIT
 744         for (i = 0; i < re_ctx.nsaved; i++) {
 745                 re_ctx.saved[i] = (duk_uint8_t *) NULL;
 746         }
 747 #endif
 748
 749         DUK_DDD(DUK_DDDPRINT("regexp ctx initialized, flags=0x%08lx, nsaved=%ld, recursion_limit=%ld, steps_limit=%ld",
 750                              (unsigned long) re_ctx.re_flags, (long) re_ctx.nsaved, (long) re_ctx.recursion_limit,
 751                              (long) re_ctx.steps_limit));
 752
 753         /*
 754          *  Get starting character offset for match, and initialize 'sp' based on it.
 755          *
 756          *  Note: lastIndex is non-configurable so it must be present (we check the
 757          *  internal class of the object above, so we know it is).  User code can set
 758          *  its value to an arbitrary (garbage) value though; E5 requires that lastIndex
 759          *  be coerced to a number before using.  The code below works even if the
 760          *  property is missing: the value will then be coerced to zero.
 761          *
 762          *  Note: lastIndex may be outside Uint32 range even after ToInteger() coercion.
 763          *  For instance, ToInteger(+Infinity) = +Infinity.  We track the match offset
 764          *  as an integer, but pre-check it to be inside the 32-bit range before the loop.
 765          *  If not, the check in E5 Section 15.10.6.2, step 9.a applies.
 766          */
 767
 768         /* XXX: lastIndex handling produces a lot of asm */
 769
 770         /* [ ... re_obj input bc saved_buf ] */
 771
 772         duk_get_prop_stridx(ctx, -4, DUK_STRIDX_LAST_INDEX);  /* -> [ ... re_obj input bc saved_buf lastIndex ] */
 773         (void) duk_to_int(ctx, -1);  /* ToInteger(lastIndex) */
 774         d = duk_get_number(ctx, -1);  /* integer, but may be +/- Infinite, +/- zero (not NaN, though) */
 775         duk_pop(ctx);
 776
 777         if (global) {
 778                 if (d < 0.0 || d > (double) DUK_HSTRING_GET_CHARLEN(h_input)) {
 779                         /* match fail */
 780                         char_offset = 0;   /* not really necessary */
 781                         DUK_ASSERT(match == 0);
 782                         goto match_over;
 783                 }
 784                 char_offset = (duk_uint32_t) d;
 785         } else {
 786                 /* lastIndex must be ignored for non-global regexps, but get the
 787                  * value for (theoretical) side effects.  No side effects can
 788                  * really occur, because lastIndex is a normal property and is
 789                  * always non-configurable for RegExp instances.
 790                  */
 791                 char_offset = (duk_uint32_t) 0;
 792         }
 793
 794         sp = re_ctx.input + duk_heap_strcache_offset_char2byte(thr, h_input, char_offset);
 795
 796         /*
 797          *  Match loop.
 798          *
 799          *  Try matching at different offsets until match found or input exhausted.
 800          */
 801
 802         /* [ ... re_obj input bc saved_buf ] */
 803
 804         DUK_ASSERT(match == 0);
 805
 806         for (;;) {
 807                 /* char offset in [0, h_input->clen] (both ends inclusive), checked before entry */
 808                 DUK_ASSERT_DISABLE(char_offset >= 0);
 809                 DUK_ASSERT(char_offset <= DUK_HSTRING_GET_CHARLEN(h_input));
 810
 811                 /* Note: ctx.steps is intentionally not reset, it applies to the entire unanchored match */
 812                 DUK_ASSERT(re_ctx.recursion_depth == 0);
 813
 814                 DUK_DDD(DUK_DDDPRINT("attempt match at char offset %ld; %p [%p,%p]",
 815                                      (long) char_offset, (void *) sp, (void *) re_ctx.input,
 816                                      (void *) re_ctx.input_end));
 817
 818                 /*
 819                  *  Note:
 820                  *
 821                  *    - duk__match_regexp() is required not to longjmp() in ordinary "non-match"
 822                  *      conditions; a longjmp() will terminate the entire matching process.
 823                  *
 824                  *    - Clearing saved[] is not necessary because backtracking does it
 825                  *
 826                  *    - Backtracking also rewinds ctx.recursion back to zero, unless an
 827                  *      internal/limit error occurs (which causes a longjmp())
 828                  *
 829                  *    - If we supported anchored matches, we would break out here
 830                  *      unconditionally; however, Ecmascript regexps don't have anchored
 831                  *      matches.  It might make sense to implement a fast bail-out if
 832                  *      the regexp begins with '^' and sp is not 0: currently we'll just
 833                  *      run through the entire input string, trivially failing the match
 834                  *      at every non-zero offset.
 835                  */
 836
 837                 if (duk__match_regexp(&re_ctx, re_ctx.bytecode, sp) != NULL) {
 838                         DUK_DDD(DUK_DDDPRINT("match at offset %ld", (long) char_offset));
 839                         match = 1;
 840                         break;
 841                 }
 842
 843                 /* advance by one character (code point) and one char_offset */
 844                 char_offset++;
 845                 if (char_offset > DUK_HSTRING_GET_CHARLEN(h_input)) {
 846                         /*
 847                          *  Note:
 848                          *
 849                          *    - Intentionally attempt (empty) match at char_offset == k_input->clen
 850                          *
 851                          *    - Negative char_offsets have been eliminated and char_offset is duk_uint32_t
 852                          *      -> no need or use for a negative check
 853                          */
 854
 855                         DUK_DDD(DUK_DDDPRINT("no match after trying all sp offsets"));
 856                         break;
 857                 }
 858
 859                 /* avoid calling at end of input, will DUK_ERROR (above check suffices to avoid this) */
 860                 (void) duk__utf8_advance(thr, &sp, re_ctx.input, re_ctx.input_end, (duk_uint_fast32_t) 1);
 861         }
 862
 863  match_over:
 864
 865         /*
 866          *  Matching complete, create result array or return a 'null'.  Update lastIndex
 867          *  if necessary.  See E5 Section 15.10.6.2.
 868          *
 869          *  Because lastIndex is a character (not byte) offset, we need the character
 870          *  length of the match which we conveniently get as a side effect of interning
 871          *  the matching substring (0th index of result array).
 872          *
 873          *  saved[0]         start pointer (~ byte offset) of current match
 874          *  saved[1]         end pointer (~ byte offset) of current match (exclusive)
 875          *  char_offset      start character offset of current match (-> .index of result)
 876          *  char_end_offset  end character offset (computed below)
 877          */
 878
 879         /* [ ... re_obj input bc saved_buf ] */
 880
 881         if (match) {
 882 #ifdef DUK_USE_ASSERTIONS
 883                 duk_hobject *h_res;
 884 #endif
 885                 duk_uint32_t char_end_offset = 0;
 886
 887                 DUK_DDD(DUK_DDDPRINT("regexp matches at char_offset %ld", (long) char_offset));
 888
 889                 DUK_ASSERT(re_ctx.nsaved >= 2);        /* must have start and end */
 890                 DUK_ASSERT((re_ctx.nsaved % 2) == 0);  /* and even number */
 891
 892                 /* XXX: Array size is known before and (2 * re_ctx.nsaved) but not taken
 893                  * advantage of now.  The array is not compacted either, as regexp match
 894                  * objects are usually short lived.
 895                  */
 896
 897                 duk_push_array(ctx);
 898
 899 #ifdef DUK_USE_ASSERTIONS
 900                 h_res = duk_require_hobject(ctx, -1);
 901                 DUK_ASSERT(DUK_HOBJECT_HAS_EXTENSIBLE(h_res));
 902                 DUK_ASSERT(DUK_HOBJECT_HAS_EXOTIC_ARRAY(h_res));
 903                 DUK_ASSERT(DUK_HOBJECT_GET_CLASS_NUMBER(h_res) == DUK_HOBJECT_CLASS_ARRAY);
 904 #endif
 905
 906                 /* [ ... re_obj input bc saved_buf res_obj ] */
 907
 908                 duk_push_u32(ctx, char_offset);
 909                 duk_xdef_prop_stridx_wec(ctx, -2, DUK_STRIDX_INDEX);
 910
 911                 duk_dup(ctx, -4);
 912                 duk_xdef_prop_stridx_wec(ctx, -2, DUK_STRIDX_INPUT);
 913
 914                 for (i = 0; i < re_ctx.nsaved; i += 2) {
 915                         /* Captures which are undefined have NULL pointers and are returned
 916                          * as 'undefined'.  The same is done when saved[] pointers are insane
 917                          * (this should, of course, never happen in practice).
 918                          */
 919                         if (re_ctx.saved[i] && re_ctx.saved[i+1] && re_ctx.saved[i+1] >= re_ctx.saved[i]) {
 920                                 duk_hstring *h_saved;
 921
 922                                 duk_push_lstring(ctx,
 923                                                  (char *) re_ctx.saved[i],
 924                                                  (duk_size_t) (re_ctx.saved[i+1] - re_ctx.saved[i]));
 925                                 h_saved = duk_get_hstring(ctx, -1);
 926                                 DUK_ASSERT(h_saved != NULL);
 927
 928                                 if (i == 0) {
 929                                         /* Assumes that saved[0] and saved[1] are always
 930                                          * set by regexp bytecode (if not, char_end_offset
 931                                          * will be zero).  Also assumes clen reflects the
 932                                          * correct char length.
 933                                          */
 934                                         char_end_offset = char_offset + DUK_HSTRING_GET_CHARLEN(h_saved);
 935                                 }
 936                         } else {
 937                                 duk_push_undefined(ctx);
 938                         }
 939
 940                         /* [ ... re_obj input bc saved_buf res_obj val ] */
 941                         duk_put_prop_index(ctx, -2, i / 2);
 942                 }
 943
 944                 /* [ ... re_obj input bc saved_buf res_obj ] */
 945
 946                 /* NB: 'length' property is automatically updated by the array setup loop */
 947
 948                 if (global) {
 949                         /* global regexp: lastIndex updated on match */
 950                         duk_push_u32(ctx, char_end_offset);
 951                         duk_put_prop_stridx(ctx, -6, DUK_STRIDX_LAST_INDEX);
 952                 } else {
 953                         /* non-global regexp: lastIndex never updated on match */
 954                         ;
 955                 }
 956         } else {
 957                 /*
 958                  *  No match, E5 Section 15.10.6.2, step 9.a.i - 9.a.ii apply, regardless
 959                  *  of 'global' flag of the RegExp.  In particular, if lastIndex is invalid
 960                  *  initially, it is reset to zero.
 961                  */
 962
 963                 DUK_DDD(DUK_DDDPRINT("regexp does not match"));
 964
 965                 duk_push_null(ctx);
 966
 967                 /* [ ... re_obj input bc saved_buf res_obj ] */
 968
 969                 duk_push_int(ctx, 0);
 970                 duk_put_prop_stridx(ctx, -6, DUK_STRIDX_LAST_INDEX);
 971         }
 972
 973         /* [ ... re_obj input bc saved_buf res_obj ] */
 974
 975         duk_insert(ctx, -5);
 976
 977         /* [ ... res_obj re_obj input bc saved_buf ] */
 978
 979         duk_pop_n(ctx, 4);
 980
 981         /* [ ... res_obj ] */
 982
 983         /* XXX: these last tricks are unnecessary if the function is made
 984          * a genuine native function.
 985          */
 986 }
 987
 988 DUK_INTERNAL void duk_regexp_match(duk_hthread *thr) {
 989         duk__regexp_match_helper(thr, 0 /*force_global*/);
 990 }
 991
 992 /* This variant is needed by String.prototype.split(); it needs to perform
 993  * global-style matching on a cloned RegExp which is potentially non-global.
 994  */
 995 DUK_INTERNAL void duk_regexp_match_force_global(duk_hthread *thr) {
 996         duk__regexp_match_helper(thr, 1 /*force_global*/);
 997 }
 998
 999 #else  /* DUK_USE_REGEXP_SUPPORT */
1000
1001 /* regexp support disabled */
1002
1003 #endif  /* DUK_USE_REGEXP_SUPPORT */