1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 *
3 * ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * The Original Code is Mozilla Communicator client code, released
17 * March 31, 1998.
18 *
19 * The Initial Developer of the Original Code is
20 * Netscape Communications Corporation.
21 * Portions created by the Initial Developer are Copyright (C) 1998
22 * the Initial Developer. All Rights Reserved.
23 *
24 * Contributor(s):
25 *
26 * Alternatively, the contents of this file may be used under the terms of
27 * either of the GNU General Public License Version 2 or later (the "GPL"),
28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
37 *
38 * ***** END LICENSE BLOCK ***** */
40 /*
41 * JS regular expressions, after Perl.
42 */
43 #include "jsstddef.h"
44 #include <stdlib.h>
45 #include <string.h>
46 #include "jstypes.h"
47 #include "jsarena.h" /* Added by JSIFY */
48 #include "jsutil.h" /* Added by JSIFY */
49 #include "jsapi.h"
50 #include "jsarray.h"
51 #include "jsatom.h"
52 #include "jscntxt.h"
53 #include "jsconfig.h"
54 #include "jsfun.h"
55 #include "jsgc.h"
56 #include "jsinterp.h"
57 #include "jslock.h"
58 #include "jsnum.h"
59 #include "jsobj.h"
60 #include "jsopcode.h"
61 #include "jsregexp.h"
62 #include "jsscan.h"
63 #include "jsstr.h"
65 #ifdef XP_MAC
66 #include <MacMemory.h>
67 #endif
69 #if JS_HAS_REGEXPS
71 /* Note : contiguity of 'simple opcodes' is important for SimpleMatch() */
72 typedef enum REOp {
73 REOP_EMPTY = 0, /* match rest of input against rest of r.e. */
74 REOP_ALT = 1, /* alternative subexpressions in kid and next */
75 REOP_SIMPLE_START = 2, /* start of 'simple opcodes' */
76 REOP_BOL = 2, /* beginning of input (or line if multiline) */
77 REOP_EOL = 3, /* end of input (or line if multiline) */
78 REOP_WBDRY = 4, /* match "" at word boundary */
79 REOP_WNONBDRY = 5, /* match "" at word non-boundary */
80 REOP_DOT = 6, /* stands for any character */
81 REOP_DIGIT = 7, /* match a digit char: [0-9] */
82 REOP_NONDIGIT = 8, /* match a non-digit char: [^0-9] */
83 REOP_ALNUM = 9, /* match an alphanumeric char: [0-9a-z_A-Z] */
84 REOP_NONALNUM = 10, /* match a non-alphanumeric char: [^0-9a-z_A-Z] */
85 REOP_SPACE = 11, /* match a whitespace char */
86 REOP_NONSPACE = 12, /* match a non-whitespace char */
87 REOP_BACKREF = 13, /* back-reference (e.g., \1) to a parenthetical */
88 REOP_FLAT = 14, /* match a flat string */
89 REOP_FLAT1 = 15, /* match a single char */
90 REOP_FLATi = 16, /* case-independent REOP_FLAT */
91 REOP_FLAT1i = 17, /* case-independent REOP_FLAT1 */
92 REOP_UCFLAT1 = 18, /* single Unicode char */
93 REOP_UCFLAT1i = 19, /* case-independent REOP_UCFLAT1 */
94 REOP_UCFLAT = 20, /* flat Unicode string; len immediate counts chars */
95 REOP_UCFLATi = 21, /* case-independent REOP_UCFLAT */
96 REOP_CLASS = 22, /* character class with index */
97 REOP_NCLASS = 23, /* negated character class with index */
98 REOP_SIMPLE_END = 23, /* end of 'simple opcodes' */
99 REOP_QUANT = 25, /* quantified atom: atom{1,2} */
100 REOP_STAR = 26, /* zero or more occurrences of kid */
101 REOP_PLUS = 27, /* one or more occurrences of kid */
102 REOP_OPT = 28, /* optional subexpression in kid */
103 REOP_LPAREN = 29, /* left paren bytecode: kid is u.num'th sub-regexp */
104 REOP_RPAREN = 30, /* right paren bytecode */
105 REOP_JUMP = 31, /* for deoptimized closure loops */
106 REOP_DOTSTAR = 32, /* optimize .* to use a single opcode */
107 REOP_ANCHOR = 33, /* like .* but skips left context to unanchored r.e. */
108 REOP_EOLONLY = 34, /* $ not preceded by any pattern */
109 REOP_BACKREFi = 37, /* case-independent REOP_BACKREF */
110 REOP_LPARENNON = 41, /* non-capturing version of REOP_LPAREN */
111 REOP_ASSERT = 43, /* zero width positive lookahead assertion */
112 REOP_ASSERT_NOT = 44, /* zero width negative lookahead assertion */
113 REOP_ASSERTTEST = 45, /* sentinel at end of assertion child */
114 REOP_ASSERTNOTTEST = 46, /* sentinel at end of !assertion child */
115 REOP_MINIMALSTAR = 47, /* non-greedy version of * */
116 REOP_MINIMALPLUS = 48, /* non-greedy version of + */
117 REOP_MINIMALOPT = 49, /* non-greedy version of ? */
118 REOP_MINIMALQUANT = 50, /* non-greedy version of {} */
119 REOP_ENDCHILD = 51, /* sentinel at end of quantifier child */
120 REOP_REPEAT = 52, /* directs execution of greedy quantifier */
121 REOP_MINIMALREPEAT = 53, /* directs execution of non-greedy quantifier */
122 REOP_ALTPREREQ = 54, /* prerequisite for ALT, either of two chars */
123 REOP_ALTPREREQ2 = 55, /* prerequisite for ALT, a char or a class */
124 REOP_ENDALT = 56, /* end of final alternate */
125 REOP_CONCAT = 57, /* concatenation of terms (parse time only) */
127 REOP_END
128 } REOp;
130 #define REOP_IS_SIMPLE(op) ((unsigned)((op) - REOP_SIMPLE_START) < \
131 (unsigned)REOP_SIMPLE_END)
133 struct RENode {
134 REOp op; /* r.e. op bytecode */
135 RENode *next; /* next in concatenation order */
136 void *kid; /* first operand */
137 union {
138 void *kid2; /* second operand */
139 jsint num; /* could be a number */
140 uint16 parenIndex; /* or a parenthesis index */
141 struct { /* or a quantifier range */
142 uint16 min;
143 uint16 max;
144 JSBool greedy;
145 } range;
146 struct { /* or a character class */
147 uint16 startIndex;
148 uint16 kidlen; /* length of string at kid, in jschars */
149 uint16 bmsize; /* bitmap size, based on max char code */
150 uint16 index; /* index into class list */
151 JSBool sense;
152 } ucclass;
153 struct { /* or a literal sequence */
154 jschar chr; /* of one character */
155 uint16 length; /* or many (via the kid) */
156 } flat;
157 struct {
158 RENode *kid2; /* second operand from ALT */
159 jschar ch1; /* match char for ALTPREREQ */
160 jschar ch2; /* ditto, or class index for ALTPREREQ2 */
161 } altprereq;
162 } u;
163 };
165 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
166 ((c >= 'a') && (c <= 'z')) )
167 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
168 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
170 #define CLASS_CACHE_SIZE 4
172 typedef struct CompilerState {
173 JSContext *context;
174 JSTokenStream *tokenStream; /* For reporting errors */
175 const jschar *cpbegin;
176 const jschar *cpend;
177 const jschar *cp;
178 uint16 flags;
179 uint16 parenCount;
180 uint16 classCount; /* number of [] encountered */
181 uint16 treeDepth; /* maximum depth of parse tree */
182 size_t progLength; /* estimated bytecode length */
183 RENode *result;
184 struct {
185 const jschar *start; /* small cache of class strings */
186 uint16 length; /* since they're often the same */
187 uint16 index;
188 } classCache[CLASS_CACHE_SIZE];
189 } CompilerState;
191 typedef struct RECapture {
192 int32 index; /* start of contents, -1 for empty */
193 uint16 length; /* length of capture */
194 } RECapture;
196 typedef struct REMatchState {
197 const jschar *cp;
198 RECapture parens[1]; /* first of 're->parenCount' captures,
199 * allocated at end of this struct.
200 */
201 } REMatchState;
203 struct REBackTrackData;
205 typedef struct REProgState {
206 jsbytecode *continue_pc; /* current continuation data */
207 jsbytecode continue_op;
208 uint16 index; /* progress in text */
209 uintN parenSoFar; /* highest indexed paren started */
210 union {
211 struct {
212 uint16 min; /* current quantifier limits */
213 uint16 max;
214 } quantifier;
215 struct {
216 size_t top; /* backtrack stack state */
217 size_t sz;
218 } assertion;
219 } u;
220 } REProgState;
222 typedef struct REBackTrackData {
223 size_t sz; /* size of previous stack entry */
224 jsbytecode *backtrack_pc; /* where to backtrack to */
225 jsbytecode backtrack_op;
226 const jschar *cp; /* index in text of match at backtrack */
227 uint16 parenIndex; /* start index of saved paren contents */
228 uint16 parenCount; /* # of saved paren contents */
229 uint16 saveStateStackTop; /* number of parent states */
230 /* saved parent states follow */
231 /* saved paren contents follow */
232 } REBackTrackData;
234 #define INITIAL_STATESTACK (100)
235 #define INITIAL_BACKTRACK (8000)
237 typedef struct REGlobalData {
238 JSContext *cx;
239 JSRegExp *regexp; /* the RE in execution */
240 JSBool ok; /* runtime error (out_of_memory only?) */
241 size_t start; /* offset to start at */
242 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
243 const jschar *cpbegin, *cpend; /* text base address and limit */
245 REProgState *stateStack; /* stack of state of current parents */
246 uint16 stateStackTop;
247 uint16 stateStackLimit;
249 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
250 REBackTrackData *backTrackSP;
251 size_t backTrackStackSize;
252 size_t cursz; /* size of current stack entry */
254 JSArenaPool pool; /* I don't understand but it's faster to
255 * use this than to malloc/free the three
256 * items that are allocated from this pool
257 */
259 } REGlobalData;
262 /*
263 * 1. If IgnoreCase is false, return ch.
264 * 2. Let u be ch converted to upper case as if by calling
265 * String.prototype.toUpperCase on the one-character string ch.
266 * 3. If u does not consist of a single character, return ch.
267 * 4. Let cu be u's character.
268 * 5. If ch's code point value is greater than or equal to decimal 128 and cu's
269 * code point value is less than decimal 128, then return ch.
270 * 6. Return cu.
271 */
272 static jschar
273 upcase(jschar ch)
274 {
275 jschar cu = JS_TOUPPER(ch);
276 if (ch >= 128 && cu < 128)
277 return ch;
278 return cu;
279 }
281 static jschar
282 downcase(jschar ch)
283 {
284 jschar cl = JS_TOLOWER(ch);
285 if (cl >= 128 && ch < 128)
286 return ch;
287 return cl;
288 }
290 /* Construct and initialize an RENode, returning NULL for out-of-memory */
291 static RENode *
292 NewRENode(CompilerState *state, REOp op)
293 {
294 JSContext *cx;
295 RENode *ren;
297 cx = state->context;
298 JS_ARENA_ALLOCATE_CAST(ren, RENode *, &cx->tempPool, sizeof *ren);
299 if (!ren) {
300 JS_ReportOutOfMemory(cx);
301 return NULL;
302 }
303 ren->op = op;
304 ren->next = NULL;
305 ren->kid = NULL;
306 return ren;
307 }
309 /*
310 * Validates and converts hex ascii value.
311 */
312 static JSBool
313 isASCIIHexDigit(jschar c, uintN *digit)
314 {
315 uintN cv = c;
317 if (cv < '0')
318 return JS_FALSE;
319 if (cv <= '9') {
320 *digit = cv - '0';
321 return JS_TRUE;
322 }
323 cv |= 0x20;
324 if (cv >= 'a' && cv <= 'f') {
325 *digit = cv - 'a' + 10;
326 return JS_TRUE;
327 }
328 return JS_FALSE;
329 }
332 typedef struct {
333 REOp op;
334 const jschar *errPos;
335 uint16 parenIndex;
336 } REOpData;
339 /*
340 * Process the op against the two top operands, reducing them to a single
341 * operand in the penultimate slot. Update progLength and treeDepth.
342 */
343 static JSBool
344 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack, intN operandSP)
345 {
346 RENode *result;
348 switch (opData->op) {
349 case REOP_ALT:
350 result = NewRENode(state, REOP_ALT);
351 if (!result)
352 return JS_FALSE;
353 result->kid = operandStack[operandSP - 2];
354 result->u.kid2 = operandStack[operandSP - 1];
355 operandStack[operandSP - 2] = result;
356 /*
357 * look at both alternates to see if there's a FLAT or a CLASS at
358 * the start of each. If so, use a prerequisite match
359 */
360 ++state->treeDepth;
361 if (((RENode *) result->kid)->op == REOP_FLAT &&
362 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
363 (state->flags & JSREG_FOLD) == 0) {
364 result->op = REOP_ALTPREREQ;
365 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
366 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
367 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
368 JUMP, <end> ... ENDALT */
369 state->progLength += 13;
370 }
371 else
372 if (((RENode *) result->kid)->op == REOP_CLASS &&
373 ((RENode *) result->kid)->u.ucclass.index < 256 &&
374 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
375 (state->flags & JSREG_FOLD) == 0) {
376 result->op = REOP_ALTPREREQ2;
377 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
378 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
379 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
380 JUMP, <end> ... ENDALT */
381 state->progLength += 13;
382 }
383 else
384 if (((RENode *) result->kid)->op == REOP_FLAT &&
385 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
386 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
387 (state->flags & JSREG_FOLD) == 0) {
388 result->op = REOP_ALTPREREQ2;
389 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
390 result->u.altprereq.ch2 =
391 ((RENode *) result->u.kid2)->u.ucclass.index;
392 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
393 JUMP, <end> ... ENDALT */
394 state->progLength += 13;
395 }
396 else
397 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
398 state->progLength += 7;
399 break;
400 case REOP_CONCAT:
401 result = operandStack[operandSP - 2];
402 while (result->next)
403 result = result->next;
404 result->next = operandStack[operandSP - 1];
405 break;
406 case REOP_ASSERT:
407 case REOP_ASSERT_NOT:
408 case REOP_LPARENNON:
409 case REOP_LPAREN:
410 /* These should have been processed by a close paren. */
411 js_ReportCompileErrorNumber(state->context, state->tokenStream,
412 NULL, JSREPORT_ERROR,
413 JSMSG_MISSING_PAREN, opData->errPos);
414 return JS_FALSE;
415 default:;
416 }
417 return JS_TRUE;
418 }
420 /*
421 * Parser forward declarations.
422 */
423 static JSBool ParseTerm(CompilerState *state);
424 static JSBool ParseQuantifier(CompilerState *state);
426 /*
427 * Top-down regular expression grammar, based closely on Perl4.
428 *
429 * regexp: altern A regular expression is one or more
430 * altern '|' regexp alternatives separated by vertical bar.
431 */
432 #define INITIAL_STACK_SIZE 128
434 static JSBool
435 ParseRegExp(CompilerState *state)
436 {
437 uint16 parenIndex;
438 RENode *operand;
439 REOpData *operatorStack;
440 RENode **operandStack;
441 REOp op;
442 intN i;
443 JSBool result = JS_FALSE;
445 intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
446 intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
448 /* Watch out for empty regexp */
449 if (state->cp == state->cpend) {
450 state->result = NewRENode(state, REOP_EMPTY);
451 return (state->result != NULL);
452 }
454 operatorStack = (REOpData *)
455 JS_malloc(state->context, sizeof(REOpData) * operatorStackSize);
456 if (!operatorStack)
457 return JS_FALSE;
459 operandStack = (RENode **)
460 JS_malloc(state->context, sizeof(RENode *) * operandStackSize);
461 if (!operandStack)
462 goto out;
464 while (JS_TRUE) {
465 parenIndex = state->parenCount;
466 if (state->cp == state->cpend) {
467 /*
468 * If we are at the end of the regexp and we're short one or more
469 * operands, the regexp must have the form /x|/ or some such, with
470 * left parentheses making us short more than one operand.
471 */
472 if (operatorSP >= operandSP) {
473 operand = NewRENode(state, REOP_EMPTY);
474 if (!operand)
475 goto out;
476 goto pushOperand;
477 }
478 } else {
479 switch (*state->cp) {
480 /* balance '(' */
481 case '(': /* balance ')' */
482 ++state->cp;
483 if (state->cp + 1 < state->cpend &&
484 *state->cp == '?' &&
485 (state->cp[1] == '=' ||
486 state->cp[1] == '!' ||
487 state->cp[1] == ':')) {
488 switch (state->cp[1]) {
489 case '=':
490 op = REOP_ASSERT;
491 /* ASSERT, <next>, ... ASSERTTEST */
492 state->progLength += 4;
493 break;
494 case '!':
495 op = REOP_ASSERT_NOT;
496 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
497 state->progLength += 4;
498 break;
499 default:
500 op = REOP_LPARENNON;
501 break;
502 }
503 state->cp += 2;
504 } else {
505 op = REOP_LPAREN;
506 /* LPAREN, <index>, ... RPAREN, <index> */
507 state->progLength += 6;
508 state->parenCount++;
509 if (state->parenCount == 65535) {
510 js_ReportCompileErrorNumber(state->context,
511 state->tokenStream,
512 NULL, JSREPORT_ERROR,
513 JSMSG_TOO_MANY_PARENS);
514 goto out;
515 }
516 }
517 goto pushOperator;
518 case ')':
519 /* If there's not a stacked open parenthesis, throw
520 * a syntax error.
521 */
522 for (i = operatorSP - 1; ; i--) {
523 if (i < 0) {
524 js_ReportCompileErrorNumber(state->context,
525 state->tokenStream,
526 NULL, JSREPORT_ERROR,
527 JSMSG_UNMATCHED_RIGHT_PAREN);
528 goto out;
529 }
530 if (operatorStack[i].op == REOP_ASSERT ||
531 operatorStack[i].op == REOP_ASSERT_NOT ||
532 operatorStack[i].op == REOP_LPARENNON ||
533 operatorStack[i].op == REOP_LPAREN) {
534 break;
535 }
536 }
537 /* fall thru... */
538 case '|':
539 /* Expected an operand before these, so make an empty one */
540 operand = NewRENode(state, REOP_EMPTY);
541 if (!operand)
542 goto out;
543 goto pushOperand;
544 default:
545 if (!ParseTerm(state))
546 goto out;
547 operand = state->result;
548 pushOperand:
549 if (operandSP == operandStackSize) {
550 operandStackSize += operandStackSize;
551 operandStack =
552 (RENode **)JS_realloc(state->context, operandStack,
553 sizeof(RENode *) * operandStackSize);
554 if (!operandStack)
555 goto out;
556 }
557 operandStack[operandSP++] = operand;
558 break;
559 }
560 }
561 /* At the end; process remaining operators */
562 restartOperator:
563 if (state->cp == state->cpend) {
564 while (operatorSP) {
565 --operatorSP;
566 if (!ProcessOp(state, &operatorStack[operatorSP],
567 operandStack, operandSP))
568 goto out;
569 --operandSP;
570 }
571 JS_ASSERT(operandSP == 1);
572 state->result = operandStack[0];
573 result = JS_TRUE;
574 goto out;
575 }
576 switch (*state->cp) {
577 case '|':
578 /* Process any stacked 'concat' operators */
579 ++state->cp;
580 while (operatorSP &&
581 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
582 --operatorSP;
583 if (!ProcessOp(state, &operatorStack[operatorSP],
584 operandStack, operandSP))
585 goto out;
586 --operandSP;
587 }
588 op = REOP_ALT;
589 goto pushOperator;
591 case ')':
592 /* If there's not a stacked open parenthesis,we
593 * accept the close as a flat.
594 */
595 for (i = operatorSP - 1; ; i--) {
596 if (i < 0) {
597 js_ReportCompileErrorNumber(state->context,
598 state->tokenStream,
599 NULL, JSREPORT_ERROR,
600 JSMSG_UNMATCHED_RIGHT_PAREN);
601 goto out;
602 }
603 if (operatorStack[i].op == REOP_ASSERT ||
604 operatorStack[i].op == REOP_ASSERT_NOT ||
605 operatorStack[i].op == REOP_LPARENNON ||
606 operatorStack[i].op == REOP_LPAREN) {
607 break;
608 }
609 }
610 ++state->cp;
611 /* process everything on the stack until the open */
612 while (JS_TRUE) {
613 JS_ASSERT(operatorSP);
614 --operatorSP;
615 switch (operatorStack[operatorSP].op) {
616 case REOP_ASSERT:
617 case REOP_ASSERT_NOT:
618 case REOP_LPAREN:
619 operand = NewRENode(state, operatorStack[operatorSP].op);
620 if (!operand)
621 goto out;
622 operand->u.parenIndex =
623 operatorStack[operatorSP].parenIndex;
624 JS_ASSERT(operandSP);
625 operand->kid = operandStack[operandSP - 1];
626 operandStack[operandSP - 1] = operand;
627 ++state->treeDepth;
628 /* fall thru... */
629 case REOP_LPARENNON:
630 state->result = operandStack[operandSP - 1];
631 if (!ParseQuantifier(state))
632 goto out;
633 operandStack[operandSP - 1] = state->result;
634 goto restartOperator;
635 default:
636 if (!ProcessOp(state, &operatorStack[operatorSP],
637 operandStack, operandSP))
638 goto out;
639 --operandSP;
640 break;
641 }
642 }
643 break;
644 case '+':
645 case '*':
646 case '?':
647 case '{':
648 js_ReportCompileErrorNumber(state->context, state->tokenStream,
649 NULL, JSREPORT_ERROR,
650 JSMSG_BAD_QUANTIFIER, state->cp);
651 result = JS_FALSE;
652 goto out;
653 default:
654 /* Anything else is the start of the next term */
655 op = REOP_CONCAT;
656 pushOperator:
657 if (operatorSP == operatorStackSize) {
658 operatorStackSize += operatorStackSize;
659 operatorStack =
660 (REOpData *)JS_realloc(state->context, operatorStack,
661 sizeof(REOpData) * operatorStackSize);
662 if (!operatorStack)
663 goto out;
664 }
665 operatorStack[operatorSP].op = op;
666 operatorStack[operatorSP].errPos = state->cp;
667 operatorStack[operatorSP++].parenIndex = parenIndex;
668 break;
669 }
670 }
671 out:
672 if (operatorStack)
673 JS_free(state->context, operatorStack);
674 if (operandStack)
675 JS_free(state->context, operandStack);
676 return result;
677 }
679 /*
680 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
681 * its being on the stack, and to propagate errors to its callers.
682 */
683 #define JSREG_FIND_PAREN_COUNT 0x8000
684 #define JSREG_FIND_PAREN_ERROR 0x4000
686 /*
687 * Magic return value from FindParenCount and GetDecimalValue, to indicate
688 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
689 * its findMax parameter is non-null.
690 */
691 #define OVERFLOW_VALUE ((uintN)-1)
693 static uintN
694 FindParenCount(CompilerState *state)
695 {
696 CompilerState temp;
697 int i;
699 if (state->flags & JSREG_FIND_PAREN_COUNT)
700 return OVERFLOW_VALUE;
702 /*
703 * Copy state into temp, flag it so we never report an invalid backref,
704 * and reset its members to parse the entire regexp. This is obviously
705 * suboptimal, but GetDecimalValue calls us only if a backref appears to
706 * refer to a forward parenthetical, which is rare.
707 */
708 temp = *state;
709 temp.flags |= JSREG_FIND_PAREN_COUNT;
710 temp.cp = temp.cpbegin;
711 temp.parenCount = 0;
712 temp.classCount = 0;
713 temp.progLength = 0;
714 temp.treeDepth = 0;
715 for (i = 0; i < CLASS_CACHE_SIZE; i++)
716 temp.classCache[i].start = NULL;
718 if (!ParseRegExp(&temp)) {
719 state->flags |= JSREG_FIND_PAREN_ERROR;
720 return OVERFLOW_VALUE;
721 }
722 return temp.parenCount;
723 }
725 /*
726 * Extract and return a decimal value at state->cp. The initial character c
727 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
728 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
729 * state->flags to discover whether an error occurred under findMax.
730 */
731 static uintN
732 GetDecimalValue(jschar c, uintN max, uintN (*findMax)(CompilerState *state),
733 CompilerState *state)
734 {
735 uintN value = JS7_UNDEC(c);
736 JSBool overflow = (value > max && (!findMax || value > findMax(state)));
738 /* The following restriction allows simpler overflow checks. */
739 JS_ASSERT(max <= ((uintN)-1 - 9) / 10);
740 while (state->cp < state->cpend) {
741 c = *state->cp;
742 if (!JS7_ISDEC(c))
743 break;
744 value = 10 * value + JS7_UNDEC(c);
745 if (!overflow && value > max && (!findMax || value > findMax(state)))
746 overflow = JS_TRUE;
747 ++state->cp;
748 }
749 return overflow ? OVERFLOW_VALUE : value;
750 }
752 /*
753 * Calculate the total size of the bitmap required for a class expression.
754 */
755 static JSBool
756 CalculateBitmapSize(CompilerState *state, RENode *target, const jschar *src,
757 const jschar *end)
758 {
759 uintN max = 0;
760 JSBool inRange = JS_FALSE;
761 jschar c, rangeStart = 0;
762 uintN n, digit, nDigits, i;
764 target->u.ucclass.bmsize = 0;
765 target->u.ucclass.sense = JS_TRUE;
767 if (src == end)
768 return JS_TRUE;
770 if (*src == '^') {
771 ++src;
772 target->u.ucclass.sense = JS_FALSE;
773 }
775 while (src != end) {
776 uintN localMax = 0;
777 switch (*src) {
778 case '\\':
779 ++src;
780 c = *src++;
781 switch (c) {
782 case 'b':
783 localMax = 0x8;
784 break;
785 case 'f':
786 localMax = 0xC;
787 break;
788 case 'n':
789 localMax = 0xA;
790 break;
791 case 'r':
792 localMax = 0xD;
793 break;
794 case 't':
795 localMax = 0x9;
796 break;
797 case 'v':
798 localMax = 0xB;
799 break;
800 case 'c':
801 if (src + 1 < end && RE_IS_LETTER(src[1]))
802 localMax = (jschar) (*src++ & 0x1F);
803 else
804 localMax = '\\';
805 break;
806 case 'x':
807 nDigits = 2;
808 goto lexHex;
809 case 'u':
810 nDigits = 4;
811 lexHex:
812 n = 0;
813 for (i = 0; (i < nDigits) && (src < end); i++) {
814 c = *src++;
815 if (!isASCIIHexDigit(c, &digit)) {
816 /*
817 * Back off to accepting the original
818 *'\' as a literal.
819 */
820 src -= i + 1;
821 n = '\\';
822 break;
823 }
824 n = (n << 4) | digit;
825 }
826 localMax = n;
827 break;
828 case 'd':
829 if (inRange) {
830 JS_ReportErrorNumber(state->context,
831 js_GetErrorMessage, NULL,
832 JSMSG_BAD_CLASS_RANGE);
833 return JS_FALSE;
834 }
835 localMax = '9';
836 break;
837 case 'D':
838 case 's':
839 case 'S':
840 case 'w':
841 case 'W':
842 if (inRange) {
843 JS_ReportErrorNumber(state->context,
844 js_GetErrorMessage, NULL,
845 JSMSG_BAD_CLASS_RANGE);
846 return JS_FALSE;
847 }
848 target->u.ucclass.bmsize = 65535;
849 return JS_TRUE;
850 case '0':
851 case '1':
852 case '2':
853 case '3':
854 case '4':
855 case '5':
856 case '6':
857 case '7':
858 /*
859 * This is a non-ECMA extension - decimal escapes (in this
860 * case, octal!) are supposed to be an error inside class
861 * ranges, but supported here for backwards compatibility.
862 *
863 */
864 n = JS7_UNDEC(c);
865 c = *src;
866 if ('0' <= c && c <= '7') {
867 src++;
868 n = 8 * n + JS7_UNDEC(c);
869 c = *src;
870 if ('0' <= c && c <= '7') {
871 src++;
872 i = 8 * n + JS7_UNDEC(c);
873 if (i <= 0377)
874 n = i;
875 else
876 src--;
877 }
878 }
879 localMax = n;
880 break;
882 default:
883 localMax = c;
884 break;
885 }
886 break;
887 default:
888 localMax = *src++;
889 break;
890 }
891 if (inRange) {
892 if (rangeStart > localMax) {
893 JS_ReportErrorNumber(state->context,
894 js_GetErrorMessage, NULL,
895 JSMSG_BAD_CLASS_RANGE);
896 return JS_FALSE;
897 }
898 inRange = JS_FALSE;
899 } else {
900 if (src < end - 1) {
901 if (*src == '-') {
902 ++src;
903 inRange = JS_TRUE;
904 rangeStart = (jschar)localMax;
905 continue;
906 }
907 }
908 }
909 if (state->flags & JSREG_FOLD) {
910 c = JS_MAX(upcase((jschar)localMax), downcase((jschar)localMax));
911 if (c > localMax)
912 localMax = c;
913 }
914 if (localMax > max)
915 max = localMax;
916 }
917 target->u.ucclass.bmsize = max;
918 return JS_TRUE;
919 }
921 /*
922 * item: assertion An item is either an assertion or
923 * quantatom a quantified atom.
924 *
925 * assertion: '^' Assertions match beginning of string
926 * (or line if the class static property
927 * RegExp.multiline is true).
928 * '$' End of string (or line if the class
929 * static property RegExp.multiline is
930 * true).
931 * '\b' Word boundary (between \w and \W).
932 * '\B' Word non-boundary.
933 *
934 * quantatom: atom An unquantified atom.
935 * quantatom '{' n ',' m '}'
936 * Atom must occur between n and m times.
937 * quantatom '{' n ',' '}' Atom must occur at least n times.
938 * quantatom '{' n '}' Atom must occur exactly n times.
939 * quantatom '*' Zero or more times (same as {0,}).
940 * quantatom '+' One or more times (same as {1,}).
941 * quantatom '?' Zero or one time (same as {0,1}).
942 *
943 * any of which can be optionally followed by '?' for ungreedy
944 *
945 * atom: '(' regexp ')' A parenthesized regexp (what matched
946 * can be addressed using a backreference,
947 * see '\' n below).
948 * '.' Matches any char except '\n'.
949 * '[' classlist ']' A character class.
950 * '[' '^' classlist ']' A negated character class.
951 * '\f' Form Feed.
952 * '\n' Newline (Line Feed).
953 * '\r' Carriage Return.
954 * '\t' Horizontal Tab.
955 * '\v' Vertical Tab.
956 * '\d' A digit (same as [0-9]).
957 * '\D' A non-digit.
958 * '\w' A word character, [0-9a-z_A-Z].
959 * '\W' A non-word character.
960 * '\s' A whitespace character, [ \b\f\n\r\t\v].
961 * '\S' A non-whitespace character.
962 * '\' n A backreference to the nth (n decimal
963 * and positive) parenthesized expression.
964 * '\' octal An octal escape sequence (octal must be
965 * two or three digits long, unless it is
966 * 0 for the null character).
967 * '\x' hex A hex escape (hex must be two digits).
968 * '\u' unicode A unicode escape (must be four digits).
969 * '\c' ctrl A control character, ctrl is a letter.
970 * '\' literalatomchar Any character except one of the above
971 * that follow '\' in an atom.
972 * otheratomchar Any character not first among the other
973 * atom right-hand sides.
974 */
975 static JSBool
976 ParseTerm(CompilerState *state)
977 {
978 jschar c = *state->cp++;
979 uintN nDigits;
980 uintN num, tmp, n, i;
981 const jschar *termStart;
983 switch (c) {
984 /* assertions and atoms */
985 case '^':
986 state->result = NewRENode(state, REOP_BOL);
987 if (!state->result)
988 return JS_FALSE;
989 state->progLength++;
990 return JS_TRUE;
991 case '$':
992 state->result = NewRENode(state, REOP_EOL);
993 if (!state->result)
994 return JS_FALSE;
995 state->progLength++;
996 return JS_TRUE;
997 case '\\':
998 if (state->cp >= state->cpend) {
999 /* a trailing '\' is an error */
1000 js_ReportCompileErrorNumber(state->context, state->tokenStream,
1001 NULL, JSREPORT_ERROR,
1002 JSMSG_TRAILING_SLASH);
1003 return JS_FALSE;
1004 }
1005 c = *state->cp++;
1006 switch (c) {
1007 /* assertion escapes */
1008 case 'b' :
1009 state->result = NewRENode(state, REOP_WBDRY);
1010 if (!state->result)
1011 return JS_FALSE;
1012 state->progLength++;
1013 return JS_TRUE;
1014 case 'B':
1015 state->result = NewRENode(state, REOP_WNONBDRY);
1016 if (!state->result)
1017 return JS_FALSE;
1018 state->progLength++;
1019 return JS_TRUE;
1020 /* Decimal escape */
1021 case '0':
1022 if (JS_HAS_STRICT_OPTION(state->context)) {
1023 if (!js_ReportCompileErrorNumber(state->context,
1024 state->tokenStream,
1025 NULL,
1026 JSREPORT_WARNING |
1027 JSREPORT_STRICT,
1028 JSMSG_INVALID_BACKREF)) {
1029 return JS_FALSE;
1030 }
1031 c = 0;
1032 } else {
1033 doOctal:
1034 num = 0;
1035 while (state->cp < state->cpend) {
1036 c = *state->cp;
1037 if (c < '0' || '7' < c)
1038 break;
1039 state->cp++;
1040 tmp = 8 * num + (uintN)JS7_UNDEC(c);
1041 if (tmp > 0377)
1042 break;
1043 num = tmp;
1044 }
1045 c = (jschar)num;
1046 }
1047 doFlat:
1048 state->result = NewRENode(state, REOP_FLAT);
1049 if (!state->result)
1050 return JS_FALSE;
1051 state->result->u.flat.chr = c;
1052 state->result->u.flat.length = 1;
1053 state->progLength += 3;
1054 break;
1055 case '1':
1056 case '2':
1057 case '3':
1058 case '4':
1059 case '5':
1060 case '6':
1061 case '7':
1062 case '8':
1063 case '9':
1064 termStart = state->cp - 1;
1065 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1066 if (state->flags & JSREG_FIND_PAREN_ERROR)
1067 return JS_FALSE;
1068 if (num == OVERFLOW_VALUE) {
1069 if (!JS_HAS_STRICT_OPTION(state->context)) {
1070 state->cp = termStart;
1071 goto doOctal;
1072 }
1073 if (!js_ReportCompileErrorNumber(state->context,
1074 state->tokenStream,
1075 NULL,
1076 JSREPORT_WARNING |
1077 JSREPORT_STRICT,
1078 (c >= '8')
1079 ? JSMSG_INVALID_BACKREF
1080 : JSMSG_BAD_BACKREF)) {
1081 return JS_FALSE;
1082 }
1083 num = 0x10000;
1084 }
1085 JS_ASSERT(1 <= num && num <= 0x10000);
1086 state->result = NewRENode(state, REOP_BACKREF);
1087 if (!state->result)
1088 return JS_FALSE;
1089 state->result->u.parenIndex = num - 1;
1090 state->progLength += 3;
1091 break;
1092 /* Control escape */
1093 case 'f':
1094 c = 0xC;
1095 goto doFlat;
1096 case 'n':
1097 c = 0xA;
1098 goto doFlat;
1099 case 'r':
1100 c = 0xD;
1101 goto doFlat;
1102 case 't':
1103 c = 0x9;
1104 goto doFlat;
1105 case 'v':
1106 c = 0xB;
1107 goto doFlat;
1108 /* Control letter */
1109 case 'c':
1110 if (state->cp + 1 < state->cpend && RE_IS_LETTER(state->cp[1])) {
1111 c = (jschar) (*state->cp++ & 0x1F);
1112 } else {
1113 /* back off to accepting the original '\' as a literal */
1114 --state->cp;
1115 c = '\\';
1116 }
1117 goto doFlat;
1118 /* HexEscapeSequence */
1119 case 'x':
1120 nDigits = 2;
1121 goto lexHex;
1122 /* UnicodeEscapeSequence */
1123 case 'u':
1124 nDigits = 4;
1125 lexHex:
1126 n = 0;
1127 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1128 uintN digit;
1129 c = *state->cp++;
1130 if (!isASCIIHexDigit(c, &digit)) {
1131 /*
1132 * back off to accepting the original
1133 * 'u' or 'x' as a literal
1134 */
1135 state->cp -= i + 2;
1136 n = *state->cp++;
1137 break;
1138 }
1139 n = (n << 4) | digit;
1140 }
1141 c = (jschar) n;
1142 goto doFlat;
1143 /* Character class escapes */
1144 case 'd':
1145 state->result = NewRENode(state, REOP_DIGIT);
1146 doSimple:
1147 if (!state->result)
1148 return JS_FALSE;
1149 state->progLength++;
1150 break;
1151 case 'D':
1152 state->result = NewRENode(state, REOP_NONDIGIT);
1153 goto doSimple;
1154 case 's':
1155 state->result = NewRENode(state, REOP_SPACE);
1156 goto doSimple;
1157 case 'S':
1158 state->result = NewRENode(state, REOP_NONSPACE);
1159 goto doSimple;
1160 case 'w':
1161 state->result = NewRENode(state, REOP_ALNUM);
1162 goto doSimple;
1163 case 'W':
1164 state->result = NewRENode(state, REOP_NONALNUM);
1165 goto doSimple;
1166 /* IdentityEscape */
1167 default:
1168 state->result = NewRENode(state, REOP_FLAT);
1169 if (!state->result)
1170 return JS_FALSE;
1171 state->result->u.flat.chr = c;
1172 state->result->u.flat.length = 1;
1173 state->result->kid = (void *) (state->cp - 1);
1174 state->progLength += 3;
1175 break;
1176 }
1177 break;
1178 case '[':
1179 state->result = NewRENode(state, REOP_CLASS);
1180 if (!state->result)
1181 return JS_FALSE;
1182 termStart = state->cp;
1183 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1184 while (JS_TRUE) {
1185 if (state->cp == state->cpend) {
1186 js_ReportCompileErrorNumber(state->context, state->tokenStream,
1187 NULL, JSREPORT_ERROR,
1188 JSMSG_UNTERM_CLASS, termStart);
1189 return JS_FALSE;
1190 }
1191 if (*state->cp == '\\') {
1192 state->cp++;
1193 } else {
1194 if (*state->cp == ']') {
1195 state->result->u.ucclass.kidlen = state->cp - termStart;
1196 break;
1197 }
1198 }
1199 state->cp++;
1200 }
1201 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1202 if (!state->classCache[i].start) {
1203 state->classCache[i].start = termStart;
1204 state->classCache[i].length = state->result->u.ucclass.kidlen;
1205 state->classCache[i].index = state->classCount;
1206 break;
1207 }
1208 if (state->classCache[i].length ==
1209 state->result->u.ucclass.kidlen) {
1210 for (n = 0; ; n++) {
1211 if (n == state->classCache[i].length) {
1212 state->result->u.ucclass.index =
1213 state->classCache[i].index;
1214 goto claim;
1215 }
1216 if (state->classCache[i].start[n] != termStart[n])
1217 break;
1218 }
1219 }
1220 }
1221 state->result->u.ucclass.index = state->classCount++;
1222 claim:
1223 /*
1224 * Call CalculateBitmapSize now as we want any errors it finds
1225 * to be reported during the parse phase, not at execution.
1226 */
1227 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1228 return JS_FALSE;
1229 state->progLength += 3; /* CLASS, <index> */
1230 break;
1232 case '.':
1233 state->result = NewRENode(state, REOP_DOT);
1234 goto doSimple;
1235 case '*':
1236 case '+':
1237 case '?':
1238 js_ReportCompileErrorNumber(state->context, state->tokenStream,
1239 NULL, JSREPORT_ERROR,
1240 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1241 return JS_FALSE;
1242 default:
1243 state->result = NewRENode(state, REOP_FLAT);
1244 if (!state->result)
1245 return JS_FALSE;
1246 state->result->u.flat.chr = c;
1247 state->result->u.flat.length = 1;
1248 state->result->kid = (void *) (state->cp - 1);
1249 state->progLength += 3;
1250 break;
1251 }
1252 return ParseQuantifier(state);
1253 }
1255 static JSBool
1256 ParseQuantifier(CompilerState *state)
1257 {
1258 RENode *term;
1259 term = state->result;
1260 if (state->cp < state->cpend) {
1261 switch (*state->cp) {
1262 case '+':
1263 state->result = NewRENode(state, REOP_QUANT);
1264 if (!state->result)
1265 return JS_FALSE;
1266 state->result->u.range.min = 1;
1267 state->result->u.range.max = (uint16)-1;
1268 /* <PLUS>, <next> ... <ENDCHILD> */
1269 state->progLength += 4;
1270 goto quantifier;
1271 case '*':
1272 state->result = NewRENode(state, REOP_QUANT);
1273 if (!state->result)
1274 return JS_FALSE;
1275 state->result->u.range.min = 0;
1276 state->result->u.range.max = (uint16)-1;
1277 /* <STAR>, <next> ... <ENDCHILD> */
1278 state->progLength += 4;
1279 goto quantifier;
1280 case '?':
1281 state->result = NewRENode(state, REOP_QUANT);
1282 if (!state->result)
1283 return JS_FALSE;
1284 state->result->u.range.min = 0;
1285 state->result->u.range.max = 1;
1286 /* <OPT>, <next> ... <ENDCHILD> */
1287 state->progLength += 4;
1288 goto quantifier;
1289 case '{': /* balance '}' */
1290 {
1291 intN err;
1292 uintN min, max;
1293 jschar c;
1294 const jschar *errp = state->cp++;
1296 c = *state->cp;
1297 if (JS7_ISDEC(c)) {
1298 ++state->cp;
1299 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1300 c = *state->cp;
1302 if (min == OVERFLOW_VALUE) {
1303 err = JSMSG_MIN_TOO_BIG;
1304 goto quantError;
1305 }
1306 if (c == ',') {
1307 c = *++state->cp;
1308 if (JS7_ISDEC(c)) {
1309 ++state->cp;
1310 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1311 c = *state->cp;
1312 if (max == OVERFLOW_VALUE) {
1313 err = JSMSG_MAX_TOO_BIG;
1314 goto quantError;
1315 }
1316 if (min > max) {
1317 err = JSMSG_OUT_OF_ORDER;
1318 goto quantError;
1319 }
1320 } else {
1321 max = (uintN)-1;
1322 }
1323 } else {
1324 max = min;
1325 }
1326 if (c == '}') {
1327 state->result = NewRENode(state, REOP_QUANT);
1328 if (!state->result)
1329 return JS_FALSE;
1330 state->result->u.range.min = min;
1331 state->result->u.range.max = max;
1332 /* QUANT, <min>, <max>, <next> ... <ENDCHILD> */
1333 state->progLength += 8;
1334 goto quantifier;
1335 }
1336 }
1337 state->cp = errp;
1338 return JS_TRUE;
1339 quantError:
1340 js_ReportCompileErrorNumber(state->context,
1341 state->tokenStream,
1342 NULL, JSREPORT_ERROR,
1343 err, errp);
1344 return JS_FALSE;
1345 }
1346 }
1347 }
1348 return JS_TRUE;
1350 quantifier:
1351 ++state->treeDepth;
1352 ++state->cp;
1353 state->result->kid = term;
1354 if (state->cp < state->cpend && *state->cp == '?') {
1355 ++state->cp;
1356 state->result->u.range.greedy = JS_FALSE;
1357 }
1358 else
1359 state->result->u.range.greedy = JS_TRUE;
1360 return JS_TRUE;
1361 }
1363 #define CHECK_OFFSET(diff) (JS_ASSERT(((diff) >= -32768) && ((diff) <= 32767)))
1364 #define SET_OFFSET(pc,off) ((pc)[0] = JUMP_OFFSET_HI(off), \
1365 (pc)[1] = JUMP_OFFSET_LO(off))
1366 #define GET_OFFSET(pc) ((int16)(((pc)[0] << 8) | (pc)[1]))
1367 #define OFFSET_LEN (2)
1368 #define GET_ARG(pc) GET_OFFSET(pc)
1369 #define SET_ARG(pc,arg) SET_OFFSET(pc,arg)
1370 #define ARG_LEN OFFSET_LEN
1372 /*
1373 * Recursively generate bytecode for the tree rooted at t. Iteratively.
1374 */
1376 typedef struct {
1377 RENode *nextAlt;
1378 jsbytecode *nextAltFixup, *nextTermFixup, *endTermFixup;
1379 RENode *continueNode;
1380 REOp continueOp;
1381 } EmitStateStackEntry;
1383 static jsbytecode *
1384 EmitREBytecode(CompilerState *state, JSRegExp *re, intN treeDepth,
1385 jsbytecode *pc, RENode *t)
1386 {
1387 ptrdiff_t diff;
1388 RECharSet *charSet;
1389 EmitStateStackEntry *emitStateSP, *emitStateStack = NULL;
1390 REOp op;
1392 if (treeDepth) {
1393 emitStateStack =
1394 (EmitStateStackEntry *)JS_malloc(state->context,
1395 sizeof(EmitStateStackEntry) *
1396 treeDepth);
1397 if (!emitStateStack)
1398 return NULL;
1399 }
1400 emitStateSP = emitStateStack;
1401 op = t->op;
1403 while (JS_TRUE) {
1404 *pc++ = op;
1405 switch (op) {
1406 case REOP_EMPTY:
1407 --pc;
1408 break;
1410 case REOP_ALTPREREQ2:
1411 case REOP_ALTPREREQ:
1412 JS_ASSERT(emitStateSP);
1413 emitStateSP->endTermFixup = pc;
1414 pc += OFFSET_LEN;
1415 SET_ARG(pc, t->u.altprereq.ch1);
1416 pc += ARG_LEN;
1417 SET_ARG(pc, t->u.altprereq.ch2);
1418 pc += ARG_LEN;
1420 emitStateSP->nextAltFixup = pc; /* address of next alternate */
1421 pc += OFFSET_LEN;
1423 emitStateSP->continueNode = t;
1424 emitStateSP->continueOp = REOP_JUMP;
1425 ++emitStateSP;
1426 JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth);
1427 t = (RENode *) t->kid;
1428 op = t->op;
1429 continue;
1431 case REOP_JUMP:
1432 emitStateSP->nextTermFixup = pc; /* address of following term */
1433 pc += OFFSET_LEN;
1434 diff = pc - emitStateSP->nextAltFixup;
1435 CHECK_OFFSET(diff);
1436 SET_OFFSET(emitStateSP->nextAltFixup, diff);
1437 emitStateSP->continueOp = REOP_ENDALT;
1438 ++emitStateSP;
1439 JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth);
1440 t = (RENode *) t->u.kid2;
1441 op = t->op;
1442 continue;
1444 case REOP_ENDALT:
1445 diff = pc - emitStateSP->nextTermFixup;
1446 CHECK_OFFSET(diff);
1447 SET_OFFSET(emitStateSP->nextTermFixup, diff);
1448 if (t->op != REOP_ALT) {
1449 diff = pc - emitStateSP->endTermFixup;
1450 CHECK_OFFSET(diff);
1451 SET_OFFSET(emitStateSP->endTermFixup, diff);
1452 }
1453 break;
1455 case REOP_ALT:
1456 JS_ASSERT(emitStateSP);
1457 emitStateSP->nextAltFixup = pc; /* address of pointer to next alternate */
1458 pc += OFFSET_LEN;
1459 emitStateSP->continueNode = t;
1460 emitStateSP->continueOp = REOP_JUMP;
1461 ++emitStateSP;
1462 JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth);
1463 t = (RENode *) t->kid;
1464 op = t->op;
1465 continue;
1467 case REOP_FLAT:
1468 /*
1469 * Consecutize FLAT's if possible.
1470 */
1471 if (t->kid) {
1472 while (t->next &&
1473 t->next->op == REOP_FLAT &&
1474 (jschar*)t->kid + t->u.flat.length ==
1475 (jschar*)t->next->kid) {
1476 t->u.flat.length += t->next->u.flat.length;
1477 t->next = t->next->next;
1478 }
1479 }
1480 if (t->kid && (t->u.flat.length > 1)) {
1481 if (state->flags & JSREG_FOLD)
1482 pc[-1] = REOP_FLATi;
1483 else
1484 pc[-1] = REOP_FLAT;
1485 SET_ARG(pc, (jschar *)t->kid - state->cpbegin);
1486 pc += ARG_LEN;
1487 SET_ARG(pc, t->u.flat.length);
1488 pc += ARG_LEN;
1489 } else if (t->u.flat.chr < 256) {
1490 if (state->flags & JSREG_FOLD)
1491 pc[-1] = REOP_FLAT1i;
1492 else
1493 pc[-1] = REOP_FLAT1;
1494 *pc++ = (jsbytecode) t->u.flat.chr;
1495 } else {
1496 if (state->flags & JSREG_FOLD)
1497 pc[-1] = REOP_UCFLAT1i;
1498 else
1499 pc[-1] = REOP_UCFLAT1;
1500 SET_ARG(pc, t->u.flat.chr);
1501 pc += ARG_LEN;
1502 }
1503 break;
1505 case REOP_LPAREN:
1506 JS_ASSERT(emitStateSP);
1507 SET_ARG(pc, t->u.parenIndex);
1508 pc += ARG_LEN;
1509 emitStateSP->continueNode = t;
1510 emitStateSP->continueOp = REOP_RPAREN;
1511 ++emitStateSP;
1512 JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth);
1513 t = (RENode *) t->kid;
1514 op = t->op;
1515 continue;
1516 case REOP_RPAREN:
1517 SET_ARG(pc, t->u.parenIndex);
1518 pc += ARG_LEN;
1519 break;
1521 case REOP_BACKREF:
1522 SET_ARG(pc, t->u.parenIndex);
1523 pc += ARG_LEN;
1524 break;
1525 case REOP_ASSERT:
1526 JS_ASSERT(emitStateSP);
1527 emitStateSP->nextTermFixup = pc;
1528 pc += OFFSET_LEN;
1529 emitStateSP->continueNode = t;
1530 emitStateSP->continueOp = REOP_ASSERTTEST;
1531 ++emitStateSP;
1532 JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth);
1533 t = (RENode *) t->kid;
1534 op = t->op;
1535 continue;
1536 case REOP_ASSERTTEST:
1537 case REOP_ASSERTNOTTEST:
1538 diff = pc - emitStateSP->nextTermFixup;
1539 CHECK_OFFSET(diff);
1540 SET_OFFSET(emitStateSP->nextTermFixup, diff);
1541 break;
1542 case REOP_ASSERT_NOT:
1543 JS_ASSERT(emitStateSP);
1544 emitStateSP->nextTermFixup = pc;
1545 pc += OFFSET_LEN;
1546 emitStateSP->continueNode = t;
1547 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
1548 ++emitStateSP;
1549 JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth);
1550 t = (RENode *) t->kid;
1551 op = t->op;
1552 continue;
1553 case REOP_QUANT:
1554 JS_ASSERT(emitStateSP);
1555 if (t->u.range.min == 0 && t->u.range.max == (uint16)-1) {
1556 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
1557 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
1558 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
1559 } else if (t->u.range.min == 1 && t->u.range.max == (uint16) -1) {
1560 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
1561 } else {
1562 if (!t->u.range.greedy)
1563 pc[-1] = REOP_MINIMALQUANT;
1564 SET_ARG(pc, t->u.range.min);
1565 pc += ARG_LEN;
1566 SET_ARG(pc, t->u.range.max);
1567 pc += ARG_LEN;
1568 }
1569 emitStateSP->nextTermFixup = pc;
1570 pc += OFFSET_LEN;
1571 emitStateSP->continueNode = t;
1572 emitStateSP->continueOp = REOP_ENDCHILD;
1573 ++emitStateSP;
1574 JS_ASSERT((emitStateSP - emitStateStack) <= treeDepth);
1575 t = (RENode *) t->kid;
1576 op = t->op;
1577 continue;
1578 case REOP_ENDCHILD:
1579 diff = pc - emitStateSP->nextTermFixup;
1580 CHECK_OFFSET(diff);
1581 SET_OFFSET(emitStateSP->nextTermFixup, diff);
1582 break;
1583 case REOP_CLASS:
1584 if (!t->u.ucclass.sense)
1585 pc[-1] = REOP_NCLASS;
1586 SET_ARG(pc, t->u.ucclass.index);
1587 pc += ARG_LEN;
1588 charSet = &re->classList[t->u.ucclass.index];
1589 charSet->converted = JS_FALSE;
1590 charSet->length = t->u.ucclass.bmsize;
1591 charSet->u.src.startIndex = t->u.ucclass.startIndex;
1592 charSet->u.src.length = t->u.ucclass.kidlen;
1593 charSet->sense = t->u.ucclass.sense;
1594 break;
1595 default:
1596 break;
1597 }
1598 t = t->next;
1599 if (!t) {
1600 if (emitStateSP == emitStateStack)
1601 break;
1602 --emitStateSP;
1603 t = emitStateSP->continueNode;
1604 op = emitStateSP->continueOp;
1605 }
1606 else
1607 op = t->op;
1608 }
1609 if (emitStateStack)
1610 JS_free(state->context, emitStateStack);
1611 return pc;
1612 }
1615 JSRegExp *
1616 js_NewRegExp(JSContext *cx, JSTokenStream *ts,
1617 JSString *str, uintN flags, JSBool flat)
1618 {
1619 JSRegExp *re;
1620 void *mark;
1621 CompilerState state;
1622 size_t resize;
1623 jsbytecode *endPC;
1624 uintN i;
1625 size_t len;
1627 re = NULL;
1628 mark = JS_ARENA_MARK(&cx->tempPool);
1630 state.context = cx;
1631 state.tokenStream = ts;
1632 state.cpbegin = state.cp = JSSTRING_CHARS(str);
1633 state.cpend = state.cp + JSSTRING_LENGTH(str);
1634 state.flags = flags;
1635 state.parenCount = 0;
1636 state.classCount = 0;
1637 state.progLength = 0;
1638 state.treeDepth = 0;
1639 for (i = 0; i < CLASS_CACHE_SIZE; i++)
1640 state.classCache[i].start = NULL;
1642 len = JSSTRING_LENGTH(str);
1644 if (len != 0 && flat) {
1645 state.result = NewRENode(&state, REOP_FLAT);
1646 state.result->u.flat.chr = *state.cpbegin;
1647 state.result->u.flat.length = JSSTRING_LENGTH(str);
1648 state.result->kid = (void *) state.cpbegin;
1649 state.progLength += 5;
1650 } else {
1651 if (!ParseRegExp(&state))
1652 goto out;
1653 }
1654 resize = sizeof *re + state.progLength + 1;
1655 re = (JSRegExp *) JS_malloc(cx, JS_ROUNDUP(resize, sizeof(jsword)));
1656 if (!re)
1657 goto out;
1659 re->nrefs = 1;
1660 re->classCount = state.classCount;
1661 if (re->classCount) {
1662 re->classList = (RECharSet *)
1663 JS_malloc(cx, re->classCount * sizeof(RECharSet));
1664 if (!re->classList) {
1665 js_DestroyRegExp(cx, re);
1666 re = NULL;
1667 goto out;
1668 }
1669 } else {
1670 re->classList = NULL;
1671 }
1672 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
1673 if (!endPC) {
1674 js_DestroyRegExp(cx, re);
1675 re = NULL;
1676 goto out;
1677 }
1678 *endPC++ = REOP_END;
1679 JS_ASSERT(endPC <= (re->program + (state.progLength + 1)));
1681 re->flags = flags;
1682 re->cloneIndex = 0;
1683 re->parenCount = state.parenCount;
1684 re->source = str;
1686 out:
1687 JS_ARENA_RELEASE(&cx->tempPool, mark);
1688 return re;
1689 }
1691 JSRegExp *
1692 js_NewRegExpOpt(JSContext *cx, JSTokenStream *ts,
1693 JSString *str, JSString *opt, JSBool flat)
1694 {
1695 uintN flags;
1696 jschar *s;
1697 size_t i, n;
1698 char charBuf[2];
1700 flags = 0;
1701 if (opt) {
1702 s = JSSTRING_CHARS(opt);
1703 for (i = 0, n = JSSTRING_LENGTH(opt); i < n; i++) {
1704 switch (s[i]) {
1705 case 'g':
1706 flags |= JSREG_GLOB;
1707 break;
1708 case 'i':
1709 flags |= JSREG_FOLD;
1710 break;
1711 case 'm':
1712 flags |= JSREG_MULTILINE;
1713 break;
1714 default:
1715 charBuf[0] = (char)s[i];
1716 charBuf[1] = '\0';
1717 js_ReportCompileErrorNumber(cx, ts, NULL, JSREPORT_ERROR,
1718 JSMSG_BAD_FLAG, charBuf);
1719 return NULL;
1720 }
1721 }
1722 }
1723 return js_NewRegExp(cx, ts, str, flags, flat);
1724 }
1727 #define HOLD_REGEXP(cx, re) JS_ATOMIC_INCREMENT(&(re)->nrefs)
1728 #define DROP_REGEXP(cx, re) js_DestroyRegExp(cx, re)
1730 /*
1731 * Save the current state of the match - the position in the input
1732 * text as well as the position in the bytecode. The state of any
1733 * parent expressions is also saved (preceding state).
1734 * Contents of parenCount parentheses from parenIndex are also saved.
1735 */
1736 static REBackTrackData *
1737 PushBackTrackState(REGlobalData *gData, REOp op,
1738 jsbytecode *target, REMatchState *x, const jschar *cp,
1739 uintN parenIndex, intN parenCount)
1740 {
1741 intN i;
1742 REBackTrackData *result =
1743 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1745 size_t sz = sizeof(REBackTrackData) +
1746 gData->stateStackTop * sizeof(REProgState) +
1747 parenCount * sizeof(RECapture);
1749 ptrdiff_t btsize = gData->backTrackStackSize;
1750 ptrdiff_t btincr = ((char *)result + sz) -
1751 ((char *)gData->backTrackStack + btsize);
1753 if (btincr > 0) {
1754 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1756 btincr = JS_ROUNDUP(btincr, btsize);
1757 JS_ARENA_GROW_CAST(gData->backTrackStack, REBackTrackData *,
1758 &gData->pool, btsize, btincr);
1759 if (!gData->backTrackStack)
1760 return NULL;
1761 gData->backTrackStackSize = btsize + btincr;
1762 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1763 }
1764 gData->backTrackSP = result;
1765 result->sz = gData->cursz;
1766 gData->cursz = sz;
1768 result->backtrack_op = op;
1769 result->backtrack_pc = target;
1770 result->cp = cp;
1771 result->parenCount = parenCount;
1773 result->saveStateStackTop = gData->stateStackTop;
1774 JS_ASSERT(gData->stateStackTop);
1775 memcpy(result + 1, gData->stateStack,
1776 sizeof(REProgState) * result->saveStateStackTop);
1778 /* FIXME: parenCount should be uintN */
1779 JS_ASSERT(parenCount >= 0);
1780 if (parenCount > 0) {
1781 result->parenIndex = parenIndex;
1782 memcpy((char *)(result + 1) +
1783 sizeof(REProgState) * result->saveStateStackTop,
1784 &x->parens[parenIndex],
1785 sizeof(RECapture) * parenCount);
1786 for (i = 0; i < parenCount; i++)
1787 x->parens[parenIndex + i].index = -1;
1788 }
1790 return result;
1791 }
1794 /*
1795 * Consecutive literal characters.
1796 */
1797 #if 0
1798 static REMatchState *
1799 FlatNMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
1800 intN length)
1801 {
1802 intN i;
1803 if (x->cp + length > gData->cpend)
1804 return NULL;
1805 for (i = 0; i < length; i++) {
1806 if (matchChars[i] != x->cp[i])
1807 return NULL;
1808 }
1809 x->cp += length;
1810 return x;
1811 }
1812 #endif
1814 static REMatchState *
1815 FlatNIMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
1816 intN length)
1817 {
1818 intN i;
1819 if (x->cp + length > gData->cpend)
1820 return NULL;
1821 for (i = 0; i < length; i++) {
1822 if (upcase(matchChars[i]) != upcase(x->cp[i]))
1823 return NULL;
1824 }
1825 x->cp += length;
1826 return x;
1827 }
1829 /*
1830 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
1831 * 2. If E is not a character then go to step 6.
1832 * 3. Let ch be E's character.
1833 * 4. Let A be a one-element RECharSet containing the character ch.
1834 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
1835 * 6. E must be an integer. Let n be that integer.
1836 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
1837 * 8. Return an internal Matcher closure that takes two arguments, a State x
1838 * and a Continuation c, and performs the following:
1839 * 1. Let cap be x's captures internal array.
1840 * 2. Let s be cap[n].
1841 * 3. If s is undefined, then call c(x) and return its result.
1842 * 4. Let e be x's endIndex.
1843 * 5. Let len be s's length.
1844 * 6. Let f be e+len.
1845 * 7. If f>InputLength, return failure.
1846 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
1847 * such that Canonicalize(s[i]) is not the same character as
1848 * Canonicalize(Input [e+i]), then return failure.
1849 * 9. Let y be the State (f, cap).
1850 * 10. Call c(y) and return its result.
1851 */
1852 static REMatchState *
1853 BackrefMatcher(REGlobalData *gData, REMatchState *x, uintN parenIndex)
1854 {
1855 uintN len;
1856 uintN i;
1857 const jschar *parenContent;
1858 RECapture *s = &x->parens[parenIndex];
1859 if (s->index == -1)
1860 return x;
1862 len = s->length;
1863 if (x->cp + len > gData->cpend)
1864 return NULL;
1866 parenContent = &gData->cpbegin[s->index];
1867 if (gData->regexp->flags & JSREG_FOLD) {
1868 for (i = 0; i < len; i++) {
1869 if (upcase(parenContent[i]) != upcase(x->cp[i]))
1870 return NULL;
1871 }
1872 } else {
1873 for (i = 0; i < len; i++) {
1874 if (parenContent[i] != x->cp[i])
1875 return NULL;
1876 }
1877 }
1878 x->cp += len;
1879 return x;
1880 }
1883 /* Add a single character to the RECharSet */
1884 static void
1885 AddCharacterToCharSet(RECharSet *cs, jschar c)
1886 {
1887 uintN byteIndex = (uintN)(c >> 3);
1888 JS_ASSERT(c <= cs->length);
1889 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
1890 }
1893 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
1894 static void
1895 AddCharacterRangeToCharSet(RECharSet *cs, jschar c1, jschar c2)
1896 {
1897 uintN i;
1899 uintN byteIndex1 = (uintN)(c1 >> 3);
1900 uintN byteIndex2 = (uintN)(c2 >> 3);
1902 JS_ASSERT((c2 <= cs->length) && (c1 <= c2));
1904 c1 &= 0x7;
1905 c2 &= 0x7;
1907 if (byteIndex1 == byteIndex2) {
1908 cs->u.bits[byteIndex1] |= ((uint8)0xFF >> (7 - (c2 - c1))) << c1;
1909 } else {
1910 cs->u.bits[byteIndex1] |= 0xFF << c1;
1911 for (i = byteIndex1 + 1; i < byteIndex2; i++)
1912 cs->u.bits[i] = 0xFF;
1913 cs->u.bits[byteIndex2] |= (uint8)0xFF >> (7 - c2);
1914 }
1915 }
1917 /* Compile the source of the class into a RECharSet */
1918 static JSBool
1919 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
1920 {
1921 const jschar *src = JSSTRING_CHARS(gData->regexp->source) +
1922 charSet->u.src.startIndex;
1923 const jschar *end = src + charSet->u.src.length;
1925 JSBool inRange = JS_FALSE;
1926 jschar rangeStart = 0;
1927 uintN byteLength, n;
1928 jschar c, thisCh;
1929 intN nDigits, i;
1931 JS_ASSERT(!charSet->converted);
1932 charSet->converted = JS_TRUE;
1934 byteLength = (charSet->length >> 3) + 1;
1935 charSet->u.bits = (uint8 *)JS_malloc(gData->cx, byteLength);
1936 if (!charSet->u.bits)
1937 return JS_FALSE;
1938 memset(charSet->u.bits, 0, byteLength);
1940 if (src == end)
1941 return JS_TRUE;
1943 if (*src == '^') {
1944 JS_ASSERT(charSet->sense == JS_FALSE);
1945 ++src;
1946 }
1947 else
1948 JS_ASSERT(charSet->sense == JS_TRUE);
1951 while (src != end) {
1952 switch (*src) {
1953 case '\\':
1954 ++src;
1955 c = *src++;
1956 switch (c) {
1957 case 'b':
1958 thisCh = 0x8;
1959 break;
1960 case 'f':
1961 thisCh = 0xC;
1962 break;
1963 case 'n':
1964 thisCh = 0xA;
1965 break;
1966 case 'r':
1967 thisCh = 0xD;
1968 break;
1969 case 't':
1970 thisCh = 0x9;
1971 break;
1972 case 'v':
1973 thisCh = 0xB;
1974 break;
1975 case 'c':
1976 if (src + 1 < end && JS_ISWORD(src[1])) {
1977 thisCh = (jschar)(*src++ & 0x1F);
1978 } else {
1979 --src;
1980 thisCh = '\\';
1981 }
1982 break;
1983 case 'x':
1984 nDigits = 2;
1985 goto lexHex;
1986 case 'u':
1987 nDigits = 4;
1988 lexHex:
1989 n = 0;
1990 for (i = 0; (i < nDigits) && (src < end); i++) {
1991 uintN digit;
1992 c = *src++;
1993 if (!isASCIIHexDigit(c, &digit)) {
1994 /*
1995 * Back off to accepting the original '\'
1996 * as a literal
1997 */
1998 src -= i + 1;
1999 n = '\\';
2000 break;
2001 }
2002 n = (n << 4) | digit;
2003 }
2004 thisCh = (jschar)n;
2005 break;
2006 case '0':
2007 case '1':
2008 case '2':
2009 case '3':
2010 case '4':
2011 case '5':
2012 case '6':
2013 case '7':
2014 /*
2015 * This is a non-ECMA extension - decimal escapes (in this
2016 * case, octal!) are supposed to be an error inside class
2017 * ranges, but supported here for backwards compatibility.
2018 *
2019 */
2020 n = JS7_UNDEC(c);
2021 c = *src;
2022 if ('0' <= c && c <= '7') {
2023 src++;
2024 n = 8 * n + JS7_UNDEC(c);
2025 c = *src;
2026 if ('0' <= c && c <= '7') {
2027 src++;
2028 i = 8 * n + JS7_UNDEC(c);
2029 if (i <= 0377)
2030 n = i;
2031 else
2032 src--;
2033 }
2034 }
2035 thisCh = (jschar)n;
2036 break;
2038 case 'd':
2039 AddCharacterRangeToCharSet(charSet, '0', '9');
2040 continue; /* don't need range processing */
2041 case 'D':
2042 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2043 AddCharacterRangeToCharSet(charSet,
2044 (jschar)('9' + 1),
2045 (jschar)charSet->length);
2046 continue;
2047 case 's':
2048 for (i = (intN)charSet->length; i >= 0; i--)
2049 if (JS_ISSPACE(i))
2050 AddCharacterToCharSet(charSet, (jschar)i);
2051 continue;
2052 case 'S':
2053 for (i = (intN)charSet->length; i >= 0; i--)
2054 if (!JS_ISSPACE(i))
2055 AddCharacterToCharSet(charSet, (jschar)i);
2056 continue;
2057 case 'w':
2058 for (i = (intN)charSet->length; i >= 0; i--)
2059 if (JS_ISWORD(i))
2060 AddCharacterToCharSet(charSet, (jschar)i);
2061 continue;
2062 case 'W':
2063 for (i = (intN)charSet->length; i >= 0; i--)
2064 if (!JS_ISWORD(i))
2065 AddCharacterToCharSet(charSet, (jschar)i);
2066 continue;
2067 default:
2068 thisCh = c;
2069 break;
2071 }
2072 break;
2074 default:
2075 thisCh = *src++;
2076 break;
2078 }
2079 if (inRange) {
2080 if (gData->regexp->flags & JSREG_FOLD) {
2081 AddCharacterRangeToCharSet(charSet, upcase(rangeStart),
2082 upcase(thisCh));
2083 AddCharacterRangeToCharSet(charSet, downcase(rangeStart),
2084 downcase(thisCh));
2085 } else {
2086 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2087 }
2088 inRange = JS_FALSE;
2089 } else {
2090 if (gData->regexp->flags & JSREG_FOLD) {
2091 AddCharacterToCharSet(charSet, upcase(thisCh));
2092 AddCharacterToCharSet(charSet, downcase(thisCh));
2093 } else {
2094 AddCharacterToCharSet(charSet, thisCh);
2095 }
2096 if (src < end - 1) {
2097 if (*src == '-') {
2098 ++src;
2099 inRange = JS_TRUE;
2100 rangeStart = thisCh;
2101 }
2102 }
2103 }
2104 }
2105 return JS_TRUE;
2106 }
2108 void
2109 js_DestroyRegExp(JSContext *cx, JSRegExp *re)
2110 {
2111 if (JS_ATOMIC_DECREMENT(&re->nrefs) == 0) {
2112 if (re->classList) {
2113 uintN i;
2114 for (i = 0; i < re->classCount; i++) {
2115 if (re->classList[i].converted)
2116 JS_free(cx, re->classList[i].u.bits);
2117 re->classList[i].u.bits = NULL;
2118 }
2119 JS_free(cx, re->classList);
2120 }
2121 JS_free(cx, re);
2122 }
2123 }
2125 static JSBool
2126 ReallocStateStack(REGlobalData *gData)
2127 {
2128 uint16 limit = gData->stateStackLimit;
2129 size_t sz = sizeof(REProgState) * limit;
2131 JS_ARENA_GROW_CAST(gData->stateStack, REProgState *, &gData->pool, sz, sz);
2132 if (!gData->stateStack) {
2133 gData->ok = JS_FALSE;
2134 return JS_FALSE;
2135 }
2136 gData->stateStackLimit = limit + limit;
2137 return JS_TRUE;
2138 }
2140 #define PUSH_STATE_STACK(data) \
2141 JS_BEGIN_MACRO \
2142 ++(data)->stateStackTop; \
2143 if ((data)->stateStackTop == (data)->stateStackLimit && \
2144 !ReallocStateStack((data))) { \
2145 return NULL; \
2146 } \
2147 JS_END_MACRO
2149 /*
2150 * Apply the current op against the given input to see if it's going to match
2151 * or fail. Return false if we don't get a match, true if we do and update the
2152 * state of the input and pc if the update flag is true.
2153 */
2154 static REMatchState *
2155 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2156 jsbytecode **startpc, JSBool update)
2157 {
2158 REMatchState *result = NULL;
2159 jschar matchCh;
2160 uintN parenIndex;
2161 intN offset, length, index;
2162 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2163 jschar *source;
2164 const jschar *startcp = x->cp;
2165 jschar ch;
2166 RECharSet *charSet;
2168 switch (op) {
2169 case REOP_BOL:
2170 if (x->cp != gData->cpbegin) {
2171 if (!gData->cx->regExpStatics.multiline &&
2172 !(gData->regexp->flags & JSREG_MULTILINE)) {
2173 break;
2174 }
2175 if (!RE_IS_LINE_TERM(x->cp[-1]))
2176 break;
2177 }
2178 result = x;
2179 break;
2180 case REOP_EOL:
2181 if (x->cp != gData->cpend) {
2182 if (!gData->cx->regExpStatics.multiline &&
2183 !(gData->regexp->flags & JSREG_MULTILINE)) {
2184 break;
2185 }
2186 if (!RE_IS_LINE_TERM(*x->cp))
2187 break;
2188 }
2189 result = x;
2190 break;
2191 case REOP_WBDRY:
2192 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2193 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2194 result = x;
2195 }
2196 break;
2197 case REOP_WNONBDRY:
2198 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2199 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2200 result = x;
2201 }
2202 break;
2203 case REOP_DOT:
2204 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2205 result = x;
2206 result->cp++;
2207 }
2208 break;
2209 case REOP_DIGIT:
2210 if (x->cp != gData->cpend && JS_ISDIGIT(*x->cp)) {
2211 result = x;
2212 result->cp++;
2213 }
2214 break;
2215 case REOP_NONDIGIT:
2216 if (x->cp != gData->cpend && !JS_ISDIGIT(*x->cp)) {
2217 result = x;
2218 result->cp++;
2219 }
2220 break;
2221 case REOP_ALNUM:
2222 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2223 result = x;
2224 result->cp++;
2225 }
2226 break;
2227 case REOP_NONALNUM:
2228 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2229 result = x;
2230 result->cp++;
2231 }
2232 break;
2233 case REOP_SPACE:
2234 if (x->cp != gData->cpend && JS_ISSPACE(*x->cp)) {
2235 result = x;
2236 result->cp++;
2237 }
2238 break;
2239 case REOP_NONSPACE:
2240 if (x->cp != gData->cpend && !JS_ISSPACE(*x->cp)) {
2241 result = x;
2242 result->cp++;
2243 }
2244 break;
2245 case REOP_BACKREF:
2246 parenIndex = GET_ARG(pc);
2247 pc += ARG_LEN;
2248 result = BackrefMatcher(gData, x, parenIndex);
2249 break;
2250 case REOP_FLAT:
2251 offset = GET_ARG(pc);
2252 pc += ARG_LEN;
2253 length = GET_ARG(pc);
2254 pc += ARG_LEN;
2255 source = JSSTRING_CHARS(gData->regexp->source) + offset;
2256 if (x->cp + length <= gData->cpend) {
2257 for (index = 0; index < length; index++) {
2258 if (source[index] != x->cp[index])
2259 return NULL;
2260 }
2261 x->cp += length;
2262 result = x;
2263 }
2264 break;
2265 case REOP_FLAT1:
2266 matchCh = *pc++;
2267 if (x->cp != gData->cpend && *x->cp == matchCh) {
2268 result = x;
2269 result->cp++;
2270 }
2271 break;
2272 case REOP_FLATi:
2273 offset = GET_ARG(pc);
2274 pc += ARG_LEN;
2275 length = GET_ARG(pc);
2276 pc += ARG_LEN;
2277 source = JSSTRING_CHARS(gData->regexp->source);
2278 result = FlatNIMatcher(gData, x, source + offset, length);
2279 break;
2280 case REOP_FLAT1i:
2281 matchCh = *pc++;
2282 if (x->cp != gData->cpend && upcase(*x->cp) == upcase(matchCh)) {
2283 result = x;
2284 result->cp++;
2285 }
2286 break;
2287 case REOP_UCFLAT1:
2288 matchCh = GET_ARG(pc);
2289 pc += ARG_LEN;
2290 if (x->cp != gData->cpend && *x->cp == matchCh) {
2291 result = x;
2292 result->cp++;
2293 }
2294 break;
2295 case REOP_UCFLAT1i:
2296 matchCh = GET_ARG(pc);
2297 pc += ARG_LEN;
2298 if (x->cp != gData->cpend && upcase(*x->cp) == upcase(matchCh)) {
2299 result = x;
2300 result->cp++;
2301 }
2302 break;
2303 case REOP_CLASS:
2304 index = GET_ARG(pc);
2305 pc += ARG_LEN;
2306 if (x->cp != gData->cpend) {
2307 charSet = &gData->regexp->classList[index];
2308 JS_ASSERT(charSet->converted);
2309 ch = *x->cp;
2310 index = ch >> 3;
2311 if (charSet->length != 0 &&
2312 ch <= charSet->length &&
2313 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2314 result = x;
2315 result->cp++;
2316 }
2317 }
2318 break;
2319 case REOP_NCLASS:
2320 index = GET_ARG(pc);
2321 pc += ARG_LEN;
2322 if (x->cp != gData->cpend) {
2323 charSet = &gData->regexp->classList[index];
2324 JS_ASSERT(charSet->converted);
2325 ch = *x->cp;
2326 index = ch >> 3;
2327 if (charSet->length == 0 ||
2328 ch > charSet->length ||
2329 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2330 result = x;
2331 result->cp++;
2332 }
2333 }
2334 break;
2335 default:
2336 JS_ASSERT(JS_FALSE);
2337 }
2338 if (result) {
2339 if (update)
2340 *startpc = pc;
2341 else
2342 x->cp = startcp;
2343 return result;
2344 }
2345 x->cp = startcp;
2346 return NULL;
2347 }
2349 static REMatchState *
2350 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
2351 {
2352 REMatchState *result = NULL;
2353 REBackTrackData *backTrackData;
2354 intN offset;
2355 jsbytecode *nextpc;
2356 REOp nextop;
2357 RECapture *cap;
2358 REProgState *curState;
2359 const jschar *startcp;
2360 uintN parenIndex, k;
2361 uintN parenSoFar = 0;
2363 jschar matchCh1, matchCh2;
2364 RECharSet *charSet;
2366 JSBool anchor;
2367 jsbytecode *pc = gData->regexp->program;
2368 REOp op = (REOp) *pc++;
2370 /*
2371 * If the first node is a simple match, step the index into the string
2372 * until that match is made, or fail if it can't be found at all.
2373 */
2374 if (REOP_IS_SIMPLE(op)) {
2375 anchor = JS_FALSE;
2376 while (x->cp <= gData->cpend) {
2377 nextpc = pc; /* reset back to start each time */
2378 result = SimpleMatch(gData, x, op, &nextpc, JS_TRUE);
2379 if (result) {
2380 anchor = JS_TRUE;
2381 x = result;
2382 pc = nextpc; /* accept skip to next opcode */
2383 op = (REOp) *pc++;
2384 break;
2385 }
2386 gData->skipped++;
2387 x->cp++;
2388 }
2389 if (!anchor)
2390 return NULL;
2391 }
2393 while (JS_TRUE) {
2394 if (REOP_IS_SIMPLE(op)) {
2395 result = SimpleMatch(gData, x, op, &pc, JS_TRUE);
2396 } else {
2397 curState = &gData->stateStack[gData->stateStackTop];
2398 switch (op) {
2399 case REOP_EMPTY:
2400 result = x;
2401 break;
2403 case REOP_ALTPREREQ2:
2404 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2405 pc += ARG_LEN;
2406 matchCh2 = GET_ARG(pc);
2407 pc += ARG_LEN;
2408 k = GET_ARG(pc);
2409 pc += ARG_LEN;
2411 if (x->cp != gData->cpend) {
2412 if (*x->cp == matchCh2)
2413 goto doAlt;
2415 charSet = &gData->regexp->classList[k];
2416 if (!charSet->converted)
2417 if (!ProcessCharSet(gData, charSet))
2418 return NULL;
2419 matchCh1 = *x->cp;
2420 k = matchCh1 >> 3;
2421 if ((charSet->length == 0 ||
2422 matchCh1 > charSet->length ||
2423 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2424 charSet->sense) {
2425 goto doAlt;
2426 }
2427 }
2428 result = NULL;
2429 break;
2431 case REOP_ALTPREREQ:
2432 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2433 pc += ARG_LEN;
2434 matchCh1 = GET_ARG(pc);
2435 pc += ARG_LEN;
2436 matchCh2 = GET_ARG(pc);
2437 pc += ARG_LEN;
2438 if (x->cp == gData->cpend ||
2439 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2440 result = NULL;
2441 break;
2442 }
2443 /* else false thru... */
2445 case REOP_ALT:
2446 doAlt:
2447 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2448 pc += ARG_LEN; /* start of this alternate */
2449 curState->parenSoFar = parenSoFar;
2450 PUSH_STATE_STACK(gData);
2451 op = (REOp) *pc++;
2452 startcp = x->cp;
2453 if (REOP_IS_SIMPLE(op)) {
2454 if (!SimpleMatch(gData, x, op, &pc, JS_TRUE)) {
2455 op = (REOp) *nextpc++;
2456 pc = nextpc;
2457 continue;
2458 }
2459 result = x;
2460 op = (REOp) *pc++;
2461 }
2462 nextop = (REOp) *nextpc++;
2463 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2464 return NULL;
2465 continue;
2467 /*
2468 * Occurs at (succesful) end of REOP_ALT,
2469 */
2470 case REOP_JUMP:
2471 --gData->stateStackTop;
2472 offset = GET_OFFSET(pc);
2473 pc += offset;
2474 op = (REOp) *pc++;
2475 continue;
2477 /*
2478 * Occurs at last (succesful) end of REOP_ALT,
2479 */
2480 case REOP_ENDALT:
2481 --gData->stateStackTop;
2482 op = (REOp) *pc++;
2483 continue;
2485 case REOP_LPAREN:
2486 parenIndex = GET_ARG(pc);
2487 if (parenIndex + 1 > parenSoFar)
2488 parenSoFar = parenIndex + 1;
2489 pc += ARG_LEN;
2490 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2491 x->parens[parenIndex].length = 0;
2492 op = (REOp) *pc++;
2493 continue;
2494 case REOP_RPAREN:
2495 parenIndex = GET_ARG(pc);
2496 pc += ARG_LEN;
2497 cap = &x->parens[parenIndex];
2498 cap->length = x->cp - (gData->cpbegin + cap->index);
2499 op = (REOp) *pc++;
2500 continue;
2502 case REOP_ASSERT:
2503 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2504 pc += ARG_LEN; /* start of ASSERT child */
2505 op = (REOp) *pc++;
2506 if (REOP_IS_SIMPLE(op) &&
2507 !SimpleMatch(gData, x, op, &pc, JS_FALSE)) {
2508 result = NULL;
2509 break;
2510 }
2511 curState->u.assertion.top =
2512 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2513 curState->u.assertion.sz = gData->cursz;
2514 curState->index = x->cp - gData->cpbegin;
2515 curState->parenSoFar = parenSoFar;
2516 PUSH_STATE_STACK(gData);
2517 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2518 nextpc, x, x->cp, 0, 0)) {
2519 return NULL;
2520 }
2521 continue;
2522 case REOP_ASSERT_NOT:
2523 nextpc = pc + GET_OFFSET(pc);
2524 pc += ARG_LEN;
2525 op = (REOp) *pc++;
2526 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2527 SimpleMatch(gData, x, op, &pc, JS_FALSE) &&
2528 pc == nextpc) {
2529 result = NULL;
2530 break;
2531 }
2532 curState->u.assertion.top
2533 = (char *)gData->backTrackSP -
2534 (char *)gData->backTrackStack;
2535 curState->u.assertion.sz = gData->cursz;
2536 curState->index = x->cp - gData->cpbegin;
2537 curState->parenSoFar = parenSoFar;
2538 PUSH_STATE_STACK(gData);
2539 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2540 nextpc, x, x->cp, 0, 0))
2541 return NULL;
2542 continue;
2543 case REOP_ASSERTTEST:
2544 --gData->stateStackTop;
2545 --curState;
2546 x->cp = gData->cpbegin + curState->index;
2547 gData->backTrackSP =
2548 (REBackTrackData *) ((char *)gData->backTrackStack +
2549 curState->u.assertion.top);
2550 gData->cursz = curState->u.assertion.sz;
2551 if (result)
2552 result = x;
2553 break;
2554 case REOP_ASSERTNOTTEST:
2555 --gData->stateStackTop;
2556 --curState;
2557 x->cp = gData->cpbegin + curState->index;
2558 gData->backTrackSP =
2559 (REBackTrackData *) ((char *)gData->backTrackStack +
2560 curState->u.assertion.top);
2561 gData->cursz = curState->u.assertion.sz;
2562 result = (!result) ? x : NULL;
2563 break;
2565 case REOP_END:
2566 if (x)
2567 return x;
2568 break;
2570 case REOP_STAR:
2571 curState->u.quantifier.min = 0;
2572 curState->u.quantifier.max = (uint16)-1;
2573 goto quantcommon;
2574 case REOP_PLUS:
2575 curState->u.quantifier.min = 1;
2576 curState->u.quantifier.max = (uint16)-1;
2577 goto quantcommon;
2578 case REOP_OPT:
2579 curState->u.quantifier.min = 0;
2580 curState->u.quantifier.max = 1;
2581 goto quantcommon;
2582 case REOP_QUANT:
2583 curState->u.quantifier.min = GET_ARG(pc);
2584 pc += ARG_LEN;
2585 curState->u.quantifier.max = GET_ARG(pc);
2586 pc += ARG_LEN;
2587 quantcommon:
2588 if (curState->u.quantifier.max == 0) {
2589 pc = pc + GET_OFFSET(pc);
2590 op = (REOp) *pc++;
2591 result = x;
2592 continue;
2593 }
2594 /* Step over <next> */
2595 nextpc = pc + ARG_LEN;
2596 op = (REOp) *nextpc++;
2597 startcp = x->cp;
2598 if (REOP_IS_SIMPLE(op)) {
2599 if (!SimpleMatch(gData, x, op, &nextpc, JS_TRUE)) {
2600 if (curState->u.quantifier.min == 0)
2601 result = x;
2602 else
2603 result = NULL;
2604 pc = pc + GET_OFFSET(pc);
2605 break;
2606 }
2607 op = (REOp) *nextpc++;
2608 result = x;
2609 }
2610 curState->index = startcp - gData->cpbegin;
2611 curState->continue_op = REOP_REPEAT;
2612 curState->continue_pc = pc;
2613 curState->parenSoFar = parenSoFar;
2614 PUSH_STATE_STACK(gData);
2615 if (curState->u.quantifier.min == 0 &&
2616 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2617 0, 0)) {
2618 return NULL;
2619 }
2620 pc = nextpc;
2621 continue;
2623 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2624 pc = curState[-1].continue_pc;
2625 op = curState[-1].continue_op;
2626 continue;
2628 case REOP_REPEAT:
2629 --curState;
2630 do {
2631 --gData->stateStackTop;
2632 if (!result) {
2633 /* Failed, see if we have enough children. */
2634 if (curState->u.quantifier.min == 0)
2635 goto repeatDone;
2636 goto break_switch;
2637 }
2638 if (curState->u.quantifier.min == 0 &&
2639 x->cp == gData->cpbegin + curState->index) {
2640 /* matched an empty string, that'll get us nowhere */
2641 result = NULL;
2642 goto break_switch;
2643 }
2644 if (curState->u.quantifier.min != 0)
2645 curState->u.quantifier.min--;
2646 if (curState->u.quantifier.max != (uint16) -1)
2647 curState->u.quantifier.max--;
2648 if (curState->u.quantifier.max == 0)
2649 goto repeatDone;
2650 nextpc = pc + ARG_LEN;
2651 nextop = (REOp) *nextpc;
2652 startcp = x->cp;
2653 if (REOP_IS_SIMPLE(nextop)) {
2654 nextpc++;
2655 if (!SimpleMatch(gData, x, nextop, &nextpc, JS_TRUE)) {
2656 if (curState->u.quantifier.min == 0)
2657 goto repeatDone;
2658 result = NULL;
2659 goto break_switch;
2660 }
2661 result = x;
2662 }
2663 curState->index = startcp - gData->cpbegin;
2664 PUSH_STATE_STACK(gData);
2665 if (curState->u.quantifier.min == 0 &&
2666 !PushBackTrackState(gData, REOP_REPEAT,
2667 pc, x, startcp,
2668 curState->parenSoFar,
2669 parenSoFar -
2670 curState->parenSoFar)) {
2671 return NULL;
2672 }
2673 } while (*nextpc == REOP_ENDCHILD);
2674 pc = nextpc;
2675 op = (REOp) *pc++;
2676 parenSoFar = curState->parenSoFar;
2677 continue;
2679 repeatDone:
2680 result = x;
2681 pc += GET_OFFSET(pc);
2682 goto break_switch;
2684 case REOP_MINIMALSTAR:
2685 curState->u.quantifier.min = 0;
2686 curState->u.quantifier.max = (uint16)-1;
2687 goto minimalquantcommon;
2688 case REOP_MINIMALPLUS:
2689 curState->u.quantifier.min = 1;
2690 curState->u.quantifier.max = (uint16)-1;
2691 goto minimalquantcommon;
2692 case REOP_MINIMALOPT:
2693 curState->u.quantifier.min = 0;
2694 curState->u.quantifier.max = 1;
2695 goto minimalquantcommon;
2696 case REOP_MINIMALQUANT:
2697 curState->u.quantifier.min = GET_ARG(pc);
2698 pc += ARG_LEN;
2699 curState->u.quantifier.max = GET_ARG(pc);
2700 pc += ARG_LEN;
2701 minimalquantcommon:
2702 curState->index = x->cp - gData->cpbegin;
2703 curState->parenSoFar = parenSoFar;
2704 PUSH_STATE_STACK(gData);
2705 if (curState->u.quantifier.min != 0) {
2706 curState->continue_op = REOP_MINIMALREPEAT;
2707 curState->continue_pc = pc;
2708 /* step over <next> */
2709 pc += ARG_LEN;
2710 op = (REOp) *pc++;
2711 } else {
2712 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2713 pc, x, x->cp, 0, 0)) {
2714 return NULL;
2715 }
2716 --gData->stateStackTop;
2717 pc = pc + GET_OFFSET(pc);
2718 op = (REOp) *pc++;
2719 }
2720 continue;
2722 case REOP_MINIMALREPEAT:
2723 --gData->stateStackTop;
2724 --curState;
2726 if (!result) {
2727 /*
2728 * Non-greedy failure - try to consume another child.
2729 */
2730 if (curState->u.quantifier.max == (uint16) -1 ||
2731 curState->u.quantifier.max > 0) {
2732 curState->index = x->cp - gData->cpbegin;
2733 curState->continue_op = REOP_MINIMALREPEAT;
2734 curState->continue_pc = pc;
2735 pc += ARG_LEN;
2736 for (k = curState->parenSoFar; k < parenSoFar; k++)
2737 x->parens[k].index = -1;
2738 PUSH_STATE_STACK(gData);
2739 op = (REOp) *pc++;
2740 continue;
2741 }
2742 /* Don't need to adjust pc since we're going to pop. */
2743 break;
2744 }
2745 if (curState->u.quantifier.min == 0 &&
2746 x->cp == gData->cpbegin + curState->index) {
2747 /* Matched an empty string, that'll get us nowhere. */
2748 result = NULL;
2749 break;
2750 }
2751 if (curState->u.quantifier.min != 0)
2752 curState->u.quantifier.min--;
2753 if (curState->u.quantifier.max != (uint16) -1)
2754 curState->u.quantifier.max--;
2755 if (curState->u.quantifier.min != 0) {
2756 curState->continue_op = REOP_MINIMALREPEAT;
2757 curState->continue_pc = pc;
2758 pc += ARG_LEN;
2759 for (k = curState->parenSoFar; k < parenSoFar; k++)
2760 x->parens[k].index = -1;
2761 curState->index = x->cp - gData->cpbegin;
2762 PUSH_STATE_STACK(gData);
2763 op = (REOp) *pc++;
2764 continue;
2765 }
2766 curState->index = x->cp - gData->cpbegin;
2767 curState->parenSoFar = parenSoFar;
2768 PUSH_STATE_STACK(gData);
2769 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2770 pc, x, x->cp,
2771 curState->parenSoFar,
2772 parenSoFar - curState->parenSoFar)) {
2773 return NULL;
2774 }
2775 --gData->stateStackTop;
2776 pc = pc + GET_OFFSET(pc);
2777 op = (REOp) *pc++;
2778 continue;
2780 default:
2781 JS_ASSERT(JS_FALSE);
2782 result = NULL;
2783 }
2784 break_switch:;
2785 }
2786 /*
2787 * If the match failed and there's a backtrack option, take it.
2788 * Otherwise this is a complete and utter failure.
2789 */
2790 if (!result) {
2791 if (gData->cursz == 0)
2792 return NULL;
2793 backTrackData = gData->backTrackSP;
2794 gData->cursz = backTrackData->sz;
2795 gData->backTrackSP =
2796 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
2797 x->cp = backTrackData->cp;
2798 pc = backTrackData->backtrack_pc;
2799 op = backTrackData->backtrack_op;
2800 gData->stateStackTop = backTrackData->saveStateStackTop;
2801 JS_ASSERT(gData->stateStackTop);
2803 memcpy(gData->stateStack, backTrackData + 1,
2804 sizeof(REProgState) * backTrackData->saveStateStackTop);
2805 curState = &gData->stateStack[gData->stateStackTop - 1];
2807 if (backTrackData->parenCount) {
2808 memcpy(&x->parens[backTrackData->parenIndex],
2809 (char *)(backTrackData + 1) +
2810 sizeof(REProgState) * backTrackData->saveStateStackTop,
2811 sizeof(RECapture) * backTrackData->parenCount);
2812 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
2813 } else {
2814 for (k = curState->parenSoFar; k < parenSoFar; k++)
2815 x->parens[k].index = -1;
2816 parenSoFar = curState->parenSoFar;
2817 }
2818 continue;
2819 }
2820 x = result;
2822 /*
2823 * Continue with the expression.
2824 */
2825 op = (REOp)*pc++;
2826 }
2827 return NULL;
2828 }
2830 static REMatchState *
2831 MatchRegExp(REGlobalData *gData, REMatchState *x)
2832 {
2833 REMatchState *result;
2834 const jschar *cp = x->cp;
2835 const jschar *cp2;
2836 uintN j;
2838 /*
2839 * Have to include the position beyond the last character
2840 * in order to detect end-of-input/line condition.
2841 */
2842 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
2843 gData->skipped = cp2 - cp;
2844 x->cp = cp2;
2845 for (j = 0; j < gData->regexp->parenCount; j++)
2846 x->parens[j].index = -1;
2847 result = ExecuteREBytecode(gData, x);
2848 if (!gData->ok || result)
2849 return result;
2850 gData->backTrackSP = gData->backTrackStack;
2851 gData->cursz = 0;
2852 gData->stateStackTop = 0;
2853 cp2 = cp + gData->skipped;
2854 }
2855 return NULL;
2856 }
2859 static REMatchState *
2860 InitMatch(JSContext *cx, REGlobalData *gData, JSRegExp *re)
2861 {
2862 REMatchState *result;
2863 uintN i;
2865 gData->backTrackStackSize = INITIAL_BACKTRACK;
2866 JS_ARENA_ALLOCATE_CAST(gData->backTrackStack, REBackTrackData *,
2867 &gData->pool,
2868 INITIAL_BACKTRACK);
2869 if (!gData->backTrackStack)
2870 return NULL;
2871 gData->backTrackSP = gData->backTrackStack;
2872 gData->cursz = 0;
2875 gData->stateStackLimit = INITIAL_STATESTACK;
2876 JS_ARENA_ALLOCATE_CAST(gData->stateStack, REProgState *,
2877 &gData->pool,
2878 sizeof(REProgState) * INITIAL_STATESTACK);
2879 if (!gData->stateStack)
2880 return NULL;
2881 gData->stateStackTop = 0;
2883 gData->cx = cx;
2884 gData->regexp = re;
2885 gData->ok = JS_TRUE;
2887 JS_ARENA_ALLOCATE_CAST(result, REMatchState *,
2888 &gData->pool,
2889 sizeof(REMatchState)
2890 + (re->parenCount - 1) * sizeof(RECapture));
2891 if (!result)
2892 return NULL;
2894 for (i = 0; i < re->classCount; i++)
2895 if (!re->classList[i].converted)
2896 if (!ProcessCharSet(gData, &re->classList[i]))
2897 return NULL;
2899 return result;
2900 }
2902 JSBool
2903 js_ExecuteRegExp(JSContext *cx, JSRegExp *re, JSString *str, size_t *indexp,
2904 JSBool test, jsval *rval)
2905 {
2906 REGlobalData gData;
2907 REMatchState *x, *result;
2909 const jschar *cp, *ep;
2910 size_t i, length, start;
2911 JSSubString *morepar;
2912 JSBool ok;
2913 JSRegExpStatics *res;
2914 ptrdiff_t matchlen;
2915 uintN num, morenum;
2916 JSString *parstr, *matchstr;
2917 JSObject *obj;
2919 RECapture *parsub = NULL;
2921 /*
2922 * It's safe to load from cp because JSStrings have a zero at the end,
2923 * and we never let cp get beyond cpend.
2924 */
2925 start = *indexp;
2926 length = JSSTRING_LENGTH(str);
2927 if (start > length)
2928 start = length;
2929 cp = JSSTRING_CHARS(str);
2930 gData.cpbegin = cp;
2931 gData.cpend = cp + length;
2932 cp += start;
2933 gData.start = start;
2934 gData.skipped = 0;
2936 JS_InitArenaPool(&gData.pool, "RegExpPool", 8096, 4);
2937 x = InitMatch(cx, &gData, re);
2938 if (!x)
2939 return JS_FALSE;
2940 x->cp = cp;
2942 /*
2943 * Call the recursive matcher to do the real work. Return null on mismatch
2944 * whether testing or not. On match, return an extended Array object.
2945 */
2946 result = MatchRegExp(&gData, x);
2947 ok = gData.ok;
2948 if (!ok)
2949 goto out;
2950 if (!result) {
2951 *rval = JSVAL_NULL;
2952 goto out;
2953 }
2954 cp = result->cp;
2955 i = PTRDIFF(cp, gData.cpbegin, jschar);
2956 *indexp = i;
2957 matchlen = i - (start + gData.skipped);
2958 ep = cp;
2959 cp -= matchlen;
2961 if (test) {
2962 /*
2963 * Testing for a match and updating cx->regExpStatics: don't allocate
2964 * an array object, do return true.
2965 */
2966 *rval = JSVAL_TRUE;
2968 /* Avoid warning. (gcc doesn't detect that obj is needed iff !test); */
2969 obj = NULL;
2970 } else {
2971 /*
2972 * The array returned on match has element 0 bound to the matched
2973 * string, elements 1 through state.parenCount bound to the paren
2974 * matches, an index property telling the length of the left context,
2975 * and an input property referring to the input string.
2976 */
2977 obj = js_NewArrayObject(cx, 0, NULL);
2978 if (!obj) {
2979 ok = JS_FALSE;
2980 goto out;
2981 }
2982 *rval = OBJECT_TO_JSVAL(obj);
2984 #define DEFVAL(val, id) { \
2985 ok = js_DefineProperty(cx, obj, id, val, \
2986 JS_PropertyStub, JS_PropertyStub, \
2987 JSPROP_ENUMERATE, NULL); \
2988 if (!ok) { \
2989 cx->newborn[GCX_OBJECT] = NULL; \
2990 cx->newborn[GCX_STRING] = NULL; \
2991 goto out; \
2992 } \
2993 }
2995 matchstr = js_NewStringCopyN(cx, cp, matchlen, 0);
2996 if (!matchstr) {
2997 cx->newborn[GCX_OBJECT] = NULL;
2998 ok = JS_FALSE;
2999 goto out;
3000 }
3001 DEFVAL(STRING_TO_JSVAL(matchstr), INT_TO_JSVAL(0));
3002 }
3004 res = &cx->regExpStatics;
3005 res->input = str;
3006 res->parenCount = re->parenCount;
3007 if (re->parenCount == 0) {
3008 res->lastParen = js_EmptySubString;
3009 } else {
3010 for (num = 0; num < re->parenCount; num++) {
3011 parsub = &result->parens[num];
3012 if (num < 9) {
3013 if (parsub->index == -1) {
3014 res->parens[num].chars = NULL;
3015 res->parens[num].length = 0;
3016 } else {
3017 res->parens[num].chars = gData.cpbegin + parsub->index;
3018 res->parens[num].length = parsub->length;
3019 }
3020 } else {
3021 morenum = num - 9;
3022 morepar = res->moreParens;
3023 if (!morepar) {
3024 res->moreLength = 10;
3025 morepar = (JSSubString*)
3026 JS_malloc(cx, 10 * sizeof(JSSubString));
3027 } else if (morenum >= res->moreLength) {
3028 res->moreLength += 10;
3029 morepar = (JSSubString*)
3030 JS_realloc(cx, morepar, res->moreLength *
3031 sizeof(JSSubString));
3032 }
3033 if (!morepar) {
3034 cx->newborn[GCX_OBJECT] = NULL;
3035 cx->newborn[GCX_STRING] = NULL;
3036 ok = JS_FALSE;
3037 goto out;
3038 }
3039 res->moreParens = morepar;
3040 if (parsub->index == -1) {
3041 morepar[morenum].chars = NULL;
3042 morepar[morenum].length = 0;
3043 } else {
3044 morepar[morenum].chars = gData.cpbegin + parsub->index;
3045 morepar[morenum].length = parsub->length;
3046 }
3047 }
3048 if (test)
3049 continue;
3050 if (parsub->index == -1) {
3051 ok = js_DefineProperty(cx, obj, INT_TO_JSVAL(num + 1),
3052 JSVAL_VOID, NULL, NULL,
3053 JSPROP_ENUMERATE, NULL);
3054 } else {
3055 parstr = js_NewStringCopyN(cx, gData.cpbegin + parsub->index,
3056 parsub->length, 0);
3057 if (!parstr) {
3058 cx->newborn[GCX_OBJECT] = NULL;
3059 cx->newborn[GCX_STRING] = NULL;
3060 ok = JS_FALSE;
3061 goto out;
3062 }
3063 ok = js_DefineProperty(cx, obj, INT_TO_JSVAL(num + 1),
3064 STRING_TO_JSVAL(parstr), NULL, NULL,
3065 JSPROP_ENUMERATE, NULL);
3066 }
3067 if (!ok) {
3068 cx->newborn[GCX_OBJECT] = NULL;
3069 cx->newborn[GCX_STRING] = NULL;
3070 goto out;
3071 }
3072 }
3073 if (parsub->index == -1) {
3074 res->lastParen = js_EmptySubString;
3075 } else {
3076 res->lastParen.chars = gData.cpbegin + parsub->index;
3077 res->lastParen.length = parsub->length;
3078 }
3079 }
3081 if (!test) {
3082 /*
3083 * Define the index and input properties last for better for/in loop
3084 * order (so they come after the elements).
3085 */
3086 DEFVAL(INT_TO_JSVAL(start + gData.skipped),
3087 (jsid)cx->runtime->atomState.indexAtom);
3088 DEFVAL(STRING_TO_JSVAL(str),
3089 (jsid)cx->runtime->atomState.inputAtom);
3090 }
3092 #undef DEFVAL
3094 res->lastMatch.chars = cp;
3095 res->lastMatch.length = matchlen;
3096 if (cx->version == JSVERSION_1_2) {
3097 /*
3098 * JS1.2 emulated Perl4.0.1.8 (patch level 36) for global regexps used
3099 * in scalar contexts, and unintentionally for the string.match "list"
3100 * psuedo-context. On "hi there bye", the following would result:
3101 *
3102 * Language while(/ /g){print("$`");} s/ /$`/g
3103 * perl4.036 "hi", "there" "hihitherehi therebye"
3104 * perl5 "hi", "hi there" "hihitherehi therebye"
3105 * js1.2 "hi", "there" "hihitheretherebye"
3106 */
3107 res->leftContext.chars = JSSTRING_CHARS(str) + start;
3108 res->leftContext.length = gData.skipped;
3109 } else {
3110 /*
3111 * For JS1.3 and ECMAv2, emulate Perl5 exactly:
3112 *
3113 * js1.3 "hi", "hi there" "hihitherehi therebye"
3114 */
3115 res->leftContext.chars = JSSTRING_CHARS(str);
3116 res->leftContext.length = start + gData.skipped;
3117 }
3118 res->rightContext.chars = ep;
3119 res->rightContext.length = gData.cpend - ep;
3121 out:
3122 JS_FinishArenaPool(&gData.pool);
3123 return ok;
3124 }
3126 /************************************************************************/
3128 enum regexp_tinyid {
3129 REGEXP_SOURCE = -1,
3130 REGEXP_GLOBAL = -2,
3131 REGEXP_IGNORE_CASE = -3,
3132 REGEXP_LAST_INDEX = -4,
3133 REGEXP_MULTILINE = -5
3134 };
3136 #define REGEXP_PROP_ATTRS (JSPROP_PERMANENT|JSPROP_SHARED)
3138 static JSPropertySpec regexp_props[] = {
3139 {"source", REGEXP_SOURCE, REGEXP_PROP_ATTRS | JSPROP_READONLY,0,0},
3140 {"global", REGEXP_GLOBAL, REGEXP_PROP_ATTRS | JSPROP_READONLY,0,0},
3141 {"ignoreCase", REGEXP_IGNORE_CASE, REGEXP_PROP_ATTRS | JSPROP_READONLY,0,0},
3142 {"lastIndex", REGEXP_LAST_INDEX, REGEXP_PROP_ATTRS,0,0},
3143 {"multiline", REGEXP_MULTILINE, REGEXP_PROP_ATTRS | JSPROP_READONLY,0,0},
3144 {0,0,0,0,0}
3145 };
3147 static JSBool
3148 regexp_getProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
3149 {
3150 jsint slot;
3151 JSRegExp *re;
3153 if (!JSVAL_IS_INT(id))
3154 return JS_TRUE;
3155 slot = JSVAL_TO_INT(id);
3156 if (slot == REGEXP_LAST_INDEX)
3157 return JS_GetReservedSlot(cx, obj, 0, vp);
3159 JS_LOCK_OBJ(cx, obj);
3160 re = (JSRegExp *) JS_GetInstancePrivate(cx, obj, &js_RegExpClass, NULL);
3161 if (re) {
3162 switch (slot) {
3163 case REGEXP_SOURCE:
3164 *vp = STRING_TO_JSVAL(re->source);
3165 break;
3166 case REGEXP_GLOBAL:
3167 *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_GLOB) != 0);
3168 break;
3169 case REGEXP_IGNORE_CASE:
3170 *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_FOLD) != 0);
3171 break;
3172 case REGEXP_MULTILINE:
3173 *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_MULTILINE) != 0);
3174 break;
3175 }
3176 }
3177 JS_UNLOCK_OBJ(cx, obj);
3178 return JS_TRUE;
3179 }
3181 static JSBool
3182 regexp_setProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
3183 {
3184 JSBool ok;
3185 jsint slot;
3186 jsdouble lastIndex;
3188 ok = JS_TRUE;
3189 if (!JSVAL_IS_INT(id))
3190 return ok;
3191 slot = JSVAL_TO_INT(id);
3192 if (slot == REGEXP_LAST_INDEX) {
3193 if (!js_ValueToNumber(cx, *vp, &lastIndex))
3194 return JS_FALSE;
3195 lastIndex = js_DoubleToInteger(lastIndex);
3196 ok = js_NewNumberValue(cx, lastIndex, vp) &&
3197 JS_SetReservedSlot(cx, obj, 0, *vp);
3198 }
3199 return ok;
3200 }
3202 /*
3203 * RegExp class static properties and their Perl counterparts:
3204 *
3205 * RegExp.input $_
3206 * RegExp.multiline $*
3207 * RegExp.lastMatch $&
3208 * RegExp.lastParen $+
3209 * RegExp.leftContext $`
3210 * RegExp.rightContext $'
3211 */
3212 enum regexp_static_tinyid {
3213 REGEXP_STATIC_INPUT = -1,
3214 REGEXP_STATIC_MULTILINE = -2,
3215 REGEXP_STATIC_LAST_MATCH = -3,
3216 REGEXP_STATIC_LAST_PAREN = -4,
3217 REGEXP_STATIC_LEFT_CONTEXT = -5,
3218 REGEXP_STATIC_RIGHT_CONTEXT = -6
3219 };
3221 JSBool
3222 js_InitRegExpStatics(JSContext *cx, JSRegExpStatics *res)
3223 {
3224 JS_ClearRegExpStatics(cx);
3225 return js_AddRoot(cx, &res->input, "res->input");
3226 }
3228 void
3229 js_FreeRegExpStatics(JSContext *cx, JSRegExpStatics *res)
3230 {
3231 if (res->moreParens) {
3232 JS_free(cx, res->moreParens);
3233 res->moreParens = NULL;
3234 }
3235 js_RemoveRoot(cx->runtime, &res->input);
3236 }
3238 static JSBool
3239 regexp_static_getProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
3240 {
3241 jsint slot;
3242 JSRegExpStatics *res;
3243 JSString *str;
3244 JSSubString *sub;
3246 res = &cx->regExpStatics;
3247 if (!JSVAL_IS_INT(id))
3248 return JS_TRUE;
3249 slot = JSVAL_TO_INT(id);
3250 switch (slot) {
3251 case REGEXP_STATIC_INPUT:
3252 *vp = res->input ? STRING_TO_JSVAL(res->input)
3253 : JS_GetEmptyStringValue(cx);
3254 return JS_TRUE;
3255 case REGEXP_STATIC_MULTILINE:
3256 *vp = BOOLEAN_TO_JSVAL(res->multiline);
3257 return JS_TRUE;
3258 case REGEXP_STATIC_LAST_MATCH:
3259 sub = &res->lastMatch;
3260 break;
3261 case REGEXP_STATIC_LAST_PAREN:
3262 sub = &res->lastParen;
3263 break;
3264 case REGEXP_STATIC_LEFT_CONTEXT:
3265 sub = &res->leftContext;
3266 break;
3267 case REGEXP_STATIC_RIGHT_CONTEXT:
3268 sub = &res->rightContext;
3269 break;
3270 default:
3271 sub = REGEXP_PAREN_SUBSTRING(res, slot);
3272 break;
3273 }
3274 str = js_NewStringCopyN(cx, sub->chars, sub->length, 0);
3275 if (!str)
3276 return JS_FALSE;
3277 *vp = STRING_TO_JSVAL(str);
3278 return JS_TRUE;
3279 }
3281 static JSBool
3282 regexp_static_setProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
3283 {
3284 JSRegExpStatics *res;
3286 if (!JSVAL_IS_INT(id))
3287 return JS_TRUE;
3288 res = &cx->regExpStatics;
3289 /* XXX use if-else rather than switch to keep MSVC1.52 from crashing */
3290 if (JSVAL_TO_INT(id) == REGEXP_STATIC_INPUT) {
3291 if (!JSVAL_IS_STRING(*vp) &&
3292 !JS_ConvertValue(cx, *vp, JSTYPE_STRING, vp)) {
3293 return JS_FALSE;
3294 }
3295 res->input = JSVAL_TO_STRING(*vp);
3296 } else if (JSVAL_TO_INT(id) == REGEXP_STATIC_MULTILINE) {
3297 if (!JSVAL_IS_BOOLEAN(*vp) &&
3298 !JS_ConvertValue(cx, *vp, JSTYPE_BOOLEAN, vp)) {
3299 return JS_FALSE;
3300 }
3301 res->multiline = JSVAL_TO_BOOLEAN(*vp);
3302 }
3303 return JS_TRUE;
3304 }
3306 static JSPropertySpec regexp_static_props[] = {
3307 {"input",
3308 REGEXP_STATIC_INPUT,
3309 JSPROP_ENUMERATE|JSPROP_SHARED,
3310 regexp_static_getProperty, regexp_static_setProperty},
3311 {"multiline",
3312 REGEXP_STATIC_MULTILINE,
3313 JSPROP_ENUMERATE|JSPROP_SHARED,
3314 regexp_static_getProperty, regexp_static_setProperty},
3315 {"lastMatch",
3316 REGEXP_STATIC_LAST_MATCH,
3317 JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3318 regexp_static_getProperty, regexp_static_getProperty},
3319 {"lastParen",
3320 REGEXP_STATIC_LAST_PAREN,
3321 JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3322 regexp_static_getProperty, regexp_static_getProperty},
3323 {"leftContext",
3324 REGEXP_STATIC_LEFT_CONTEXT,
3325 JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3326 regexp_static_getProperty, regexp_static_getProperty},
3327 {"rightContext",
3328 REGEXP_STATIC_RIGHT_CONTEXT,
3329 JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3330 regexp_static_getProperty, regexp_static_getProperty},
3332 /* XXX should have block scope and local $1, etc. */
3333 {"$1", 0, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3334 regexp_static_getProperty, regexp_static_getProperty},
3335 {"$2", 1, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3336 regexp_static_getProperty, regexp_static_getProperty},
3337 {"$3", 2, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3338 regexp_static_getProperty, regexp_static_getProperty},
3339 {"$4", 3, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3340 regexp_static_getProperty, regexp_static_getProperty},
3341 {"$5", 4, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3342 regexp_static_getProperty, regexp_static_getProperty},
3343 {"$6", 5, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3344 regexp_static_getProperty, regexp_static_getProperty},
3345 {"$7", 6, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3346 regexp_static_getProperty, regexp_static_getProperty},
3347 {"$8", 7, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3348 regexp_static_getProperty, regexp_static_getProperty},
3349 {"$9", 8, JSPROP_ENUMERATE|JSPROP_READONLY|JSPROP_SHARED,
3350 regexp_static_getProperty, regexp_static_getProperty},
3352 {0,0,0,0,0}
3353 };
3355 static void
3356 regexp_finalize(JSContext *cx, JSObject *obj)
3357 {
3358 JSRegExp *re;
3360 re = (JSRegExp *) JS_GetPrivate(cx, obj);
3361 if (!re)
3362 return;
3363 js_DestroyRegExp(cx, re);
3364 }
3366 /* Forward static prototype. */
3367 static JSBool
3368 regexp_exec(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
3369 jsval *rval);
3371 static JSBool
3372 regexp_call(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval)
3373 {
3374 return regexp_exec(cx, JSVAL_TO_OBJECT(argv[-2]), argc, argv, rval);
3375 }
3377 #if JS_HAS_XDR
3379 #include "jsxdrapi.h"
3381 static JSBool
3382 regexp_xdrObject(JSXDRState *xdr, JSObject **objp)
3383 {
3384 JSRegExp *re;
3385 JSString *source;
3386 uint32 flagsword;
3387 JSObject *obj;
3389 if (xdr->mode == JSXDR_ENCODE) {
3390 re = (JSRegExp *) JS_GetPrivate(xdr->cx, *objp);
3391 if (!re)
3392 return JS_FALSE;
3393 source = re->source;
3394 flagsword = ((uint32)re->cloneIndex << 16) | re->flags;
3395 }
3396 if (!JS_XDRString(xdr, &source) ||
3397 !JS_XDRUint32(xdr, &flagsword)) {
3398 return JS_FALSE;
3399 }
3400 if (xdr->mode == JSXDR_DECODE) {
3401 obj = js_NewObject(xdr->cx, &js_RegExpClass, NULL, NULL);
3402 if (!obj)
3403 return JS_FALSE;
3404 re = js_NewRegExp(xdr->cx, NULL, source, (uint16)flagsword, JS_FALSE);
3405 if (!re)
3406 return JS_FALSE;
3407 if (!JS_SetPrivate(xdr->cx, obj, re) ||
3408 !js_SetLastIndex(xdr->cx, obj, 0)) {
3409 js_DestroyRegExp(xdr->cx, re);
3410 return JS_FALSE;
3411 }
3412 re->cloneIndex = (uint16)(flagsword >> 16);
3413 *objp = obj;
3414 }
3415 return JS_TRUE;
3416 }
3418 #else /* !JS_HAS_XDR */
3420 #define regexp_xdrObject NULL
3422 #endif /* !JS_HAS_XDR */
3424 static uint32
3425 regexp_mark(JSContext *cx, JSObject *obj, void *arg)
3426 {
3427 JSRegExp *re = (JSRegExp *) JS_GetPrivate(cx, obj);
3428 if (re)
3429 JS_MarkGCThing(cx, re->source, "source", arg);
3430 return 0;
3431 }
3433 JSClass js_RegExpClass = {
3434 js_RegExp_str,
3435 JSCLASS_HAS_PRIVATE | JSCLASS_HAS_RESERVED_SLOTS(1),
3436 JS_PropertyStub, JS_PropertyStub, regexp_getProperty, regexp_setProperty,
3437 JS_EnumerateStub, JS_ResolveStub, JS_ConvertStub, regexp_finalize,
3438 NULL, NULL, regexp_call, NULL,
3439 regexp_xdrObject, NULL, regexp_mark, 0
3440 };
3442 static const jschar empty_regexp_ucstr[] = {'(', '?', ':', ')', 0};
3444 JSBool
3445 js_regexp_toString(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
3446 jsval *rval)
3447 {
3448 JSRegExp *re;
3449 const jschar *source;
3450 jschar *chars;
3451 size_t length, nflags;
3452 uintN flags;
3453 JSString *str;
3455 if (!JS_InstanceOf(cx, obj, &js_RegExpClass, argv))
3456 return JS_FALSE;
3457 JS_LOCK_OBJ(cx, obj);
3458 re = (JSRegExp *) JS_GetPrivate(cx, obj);
3459 if (!re) {
3460 JS_UNLOCK_OBJ(cx, obj);
3461 *rval = STRING_TO_JSVAL(cx->runtime->emptyString);
3462 return JS_TRUE;
3463 }
3465 source = JSSTRING_CHARS(re->source);
3466 length = JSSTRING_LENGTH(re->source);
3467 if (length == 0) {
3468 source = empty_regexp_ucstr;
3469 length = sizeof(empty_regexp_ucstr) / sizeof(jschar) - 1;
3470 }
3471 length += 2;
3472 nflags = 0;
3473 for (flags = re->flags; flags != 0; flags &= flags - 1)
3474 nflags++;
3475 chars = (jschar*) JS_malloc(cx, (length + nflags + 1) * sizeof(jschar));
3476 if (!chars) {
3477 JS_UNLOCK_OBJ(cx, obj);
3478 return JS_FALSE;
3479 }
3481 chars[0] = '/';
3482 js_strncpy(&chars[1], source, length - 2);
3483 chars[length-1] = '/';
3484 if (nflags) {
3485 if (re->flags & JSREG_GLOB)
3486 chars[length++] = 'g';
3487 if (re->flags & JSREG_FOLD)
3488 chars[length++] = 'i';
3489 if (re->flags & JSREG_MULTILINE)
3490 chars[length++] = 'm';
3491 }
3492 JS_UNLOCK_OBJ(cx, obj);
3493 chars[length] = 0;
3495 str = js_NewString(cx, chars, length, 0);
3496 if (!str) {
3497 JS_free(cx, chars);
3498 return JS_FALSE;
3499 }
3500 *rval = STRING_TO_JSVAL(str);
3501 return JS_TRUE;
3502 }
3504 static JSBool
3505 regexp_compile(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
3506 jsval *rval)
3507 {
3508 JSString *opt, *str;
3509 JSRegExp *oldre, *re;
3510 JSBool ok, ok2;
3511 JSObject *obj2;
3512 size_t length, nbytes;
3513 const jschar *cp, *start, *end;
3514 jschar *nstart, *ncp, *tmp;
3516 if (!JS_InstanceOf(cx, obj, &js_RegExpClass, argv))
3517 return JS_FALSE;
3518 opt = NULL;
3519 if (argc == 0) {
3520 str = cx->runtime->emptyString;
3521 } else {
3522 if (JSVAL_IS_OBJECT(argv[0])) {
3523 /*
3524 * If we get passed in a RegExp object we construct a new
3525 * RegExp that is a duplicate of it by re-compiling the
3526 * original source code. ECMA requires that it be an error
3527 * here if the flags are specified. (We must use the flags
3528 * from the original RegExp also).
3529 */
3530 obj2 = JSVAL_TO_OBJECT(argv[0]);
3531 if (obj2 && OBJ_GET_CLASS(cx, obj2) == &js_RegExpClass) {
3532 if (argc >= 2 && !JSVAL_IS_VOID(argv[1])) { /* 'flags' passed */
3533 JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL,
3534 JSMSG_NEWREGEXP_FLAGGED);
3535 return JS_FALSE;
3536 }
3537 JS_LOCK_OBJ(cx, obj2);
3538 re = (JSRegExp *) JS_GetPrivate(cx, obj2);
3539 if (!re) {
3540 JS_UNLOCK_OBJ(cx, obj2);
3541 return JS_FALSE;
3542 }
3543 re = js_NewRegExp(cx, NULL, re->source, re->flags, JS_FALSE);
3544 JS_UNLOCK_OBJ(cx, obj2);
3545 goto created;
3546 }
3547 }
3548 str = js_ValueToString(cx, argv[0]);
3549 if (!str)
3550 return JS_FALSE;
3551 argv[0] = STRING_TO_JSVAL(str);
3552 if (argc > 1) {
3553 if (JSVAL_IS_VOID(argv[1])) {
3554 opt = NULL;
3555 } else {
3556 opt = js_ValueToString(cx, argv[1]);
3557 if (!opt)
3558 return JS_FALSE;
3559 argv[1] = STRING_TO_JSVAL(opt);
3560 }
3561 }
3563 /* Escape any naked slashes in the regexp source. */
3564 length = JSSTRING_LENGTH(str);
3565 start = JSSTRING_CHARS(str);
3566 end = start + length;
3567 nstart = ncp = NULL;
3568 for (cp = start; cp < end; cp++) {
3569 if (*cp == '/' && (cp == start || cp[-1] != '\\')) {
3570 nbytes = (++length + 1) * sizeof(jschar);
3571 if (!nstart) {
3572 nstart = (jschar *) JS_malloc(cx, nbytes);
3573 if (!nstart)
3574 return JS_FALSE;
3575 ncp = nstart + (cp - start);
3576 js_strncpy(nstart, start, cp - start);
3577 } else {
3578 tmp = (jschar *) JS_realloc(cx, nstart, nbytes);
3579 if (!tmp) {
3580 JS_free(cx, nstart);
3581 return JS_FALSE;
3582 }
3583 ncp = tmp + (ncp - nstart);
3584 nstart = tmp;
3585 }
3586 *ncp++ = '\\';
3587 }
3588 if (nstart)
3589 *ncp++ = *cp;
3590 }
3592 if (nstart) {
3593 /* Don't forget to store the backstop after the new string. */
3594 JS_ASSERT((size_t)(ncp - nstart) == length);
3595 *ncp = 0;
3596 str = js_NewString(cx, nstart, length, 0);
3597 if (!str) {
3598 JS_free(cx, nstart);
3599 return JS_FALSE;
3600 }
3601 argv[0] = STRING_TO_JSVAL(str);
3602 }
3603 }
3605 re = js_NewRegExpOpt(cx, NULL, str, opt, JS_FALSE);
3606 created:
3607 if (!re)
3608 return JS_FALSE;
3609 JS_LOCK_OBJ(cx, obj);
3610 oldre = (JSRegExp *) JS_GetPrivate(cx, obj);
3611 ok = JS_SetPrivate(cx, obj, re);
3612 ok2 = js_SetLastIndex(cx, obj, 0);
3613 JS_UNLOCK_OBJ(cx, obj);
3614 if (!ok) {
3615 js_DestroyRegExp(cx, re);
3616 return JS_FALSE;
3617 }
3618 if (oldre)
3619 js_DestroyRegExp(cx, oldre);
3620 *rval = OBJECT_TO_JSVAL(obj);
3621 return ok2;
3622 }
3624 static JSBool
3625 regexp_exec_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
3626 JSBool test, jsval *rval)
3627 {
3628 JSBool ok;
3629 JSRegExp *re;
3630 jsdouble lastIndex;
3631 JSString *str;
3632 size_t i;
3634 ok = JS_InstanceOf(cx, obj, &js_RegExpClass, argv);
3635 if (!ok)
3636 return JS_FALSE;
3637 JS_LOCK_OBJ(cx, obj);
3638 re = (JSRegExp *) JS_GetPrivate(cx, obj);
3639 if (!re) {
3640 JS_UNLOCK_OBJ(cx, obj);
3641 return JS_TRUE;
3642 }
3644 /* NB: we must reach out: after this paragraph, in order to drop re. */
3645 HOLD_REGEXP(cx, re);
3646 if (re->flags & JSREG_GLOB) {
3647 ok = js_GetLastIndex(cx, obj, &lastIndex);
3648 } else {
3649 lastIndex = 0;
3650 }
3651 JS_UNLOCK_OBJ(cx, obj);
3652 if (!ok)
3653 goto out;
3655 /* Now that obj is unlocked, it's safe to (potentially) grab the GC lock. */
3656 if (argc == 0) {
3657 str = cx->regExpStatics.input;
3658 if (!str) {
3659 JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL,
3660 JSMSG_NO_INPUT,
3661 JS_GetStringBytes(re->source),
3662 (re->flags & JSREG_GLOB) ? "g" : "",
3663 (re->flags & JSREG_FOLD) ? "i" : "",
3664 (re->flags & JSREG_MULTILINE) ? "m" : "");
3665 ok = JS_FALSE;
3666 goto out;
3667 }
3668 } else {
3669 str = js_ValueToString(cx, argv[0]);
3670 if (!str)
3671 goto out;
3672 argv[0] = STRING_TO_JSVAL(str);
3673 }
3675 if (lastIndex < 0 || JSSTRING_LENGTH(str) < lastIndex) {
3676 ok = js_SetLastIndex(cx, obj, 0);
3677 *rval = JSVAL_NULL;
3678 } else {
3679 i = (size_t) lastIndex;
3680 ok = js_ExecuteRegExp(cx, re, str, &i, test, rval);
3681 if (ok && (re->flags & JSREG_GLOB))
3682 ok = js_SetLastIndex(cx, obj, (*rval == JSVAL_NULL) ? 0 : i);
3683 }
3685 out:
3686 DROP_REGEXP(cx, re);
3687 return ok;
3688 }
3690 static JSBool
3691 regexp_exec(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval)
3692 {
3693 return regexp_exec_sub(cx, obj, argc, argv, JS_FALSE, rval);
3694 }
3696 static JSBool
3697 regexp_test(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval)
3698 {
3699 if (!regexp_exec_sub(cx, obj, argc, argv, JS_TRUE, rval))
3700 return JS_FALSE;
3701 if (*rval != JSVAL_TRUE)
3702 *rval = JSVAL_FALSE;
3703 return JS_TRUE;
3704 }
3706 static JSFunctionSpec regexp_methods[] = {
3707 #if JS_HAS_TOSOURCE
3708 {js_toSource_str, js_regexp_toString, 0,0,0},
3709 #endif
3710 {js_toString_str, js_regexp_toString, 0,0,0},
3711 {"compile", regexp_compile, 1,0,0},
3712 {"exec", regexp_exec, 0,0,0},
3713 {"test", regexp_test, 0,0,0},
3714 {0,0,0,0,0}
3715 };
3717 static JSBool
3718 RegExp(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval)
3719 {
3720 if (!(cx->fp->flags & JSFRAME_CONSTRUCTING)) {
3721 /*
3722 * If first arg is regexp and no flags are given, just return the arg.
3723 * (regexp_compile detects the regexp + flags case and throws a
3724 * TypeError.) See 10.15.3.1.
3725 */
3726 if ((argc < 2 || JSVAL_IS_VOID(argv[1])) &&
3727 JSVAL_IS_OBJECT(argv[0]) &&
3728 OBJ_GET_CLASS(cx, JSVAL_TO_OBJECT(argv[0])) == &js_RegExpClass) {
3729 *rval = argv[0];
3730 return JS_TRUE;
3731 }
3733 /* Otherwise, replace obj with a new RegExp object. */
3734 obj = js_NewObject(cx, &js_RegExpClass, NULL, NULL);
3735 if (!obj)
3736 return JS_FALSE;
3737 }
3738 return regexp_compile(cx, obj, argc, argv, rval);
3739 }
3741 JSObject *
3742 js_InitRegExpClass(JSContext *cx, JSObject *obj)
3743 {
3744 JSObject *proto, *ctor;
3745 jsval rval;
3747 proto = JS_InitClass(cx, obj, NULL, &js_RegExpClass, RegExp, 1,
3748 regexp_props, regexp_methods,
3749 regexp_static_props, NULL);
3751 if (!proto || !(ctor = JS_GetConstructor(cx, proto)))
3752 return NULL;
3753 if (!JS_AliasProperty(cx, ctor, "input", "$_") ||
3754 !JS_AliasProperty(cx, ctor, "multiline", "$*") ||
3755 !JS_AliasProperty(cx, ctor, "lastMatch", "$&") ||
3756 !JS_AliasProperty(cx, ctor, "lastParen", "$+") ||
3757 !JS_AliasProperty(cx, ctor, "leftContext", "$`") ||
3758 !JS_AliasProperty(cx, ctor, "rightContext", "$'")) {
3759 goto bad;
3760 }
3762 /* Give RegExp.prototype private data so it matches the empty string. */
3763 if (!regexp_compile(cx, proto, 0, NULL, &rval))
3764 goto bad;
3765 return proto;
3767 bad:
3768 JS_DeleteProperty(cx, obj, js_RegExpClass.name);
3769 return NULL;
3770 }
3772 JSObject *
3773 js_NewRegExpObject(JSContext *cx, JSTokenStream *ts,
3774 jschar *chars, size_t length, uintN flags)
3775 {
3776 JSString *str;
3777 JSObject *obj;
3778 JSRegExp *re;
3780 str = js_NewStringCopyN(cx, chars, length, 0);
3781 if (!str)
3782 return NULL;
3783 re = js_NewRegExp(cx, ts, str, flags, JS_FALSE);
3784 if (!re)
3785 return NULL;
3786 obj = js_NewObject(cx, &js_RegExpClass, NULL, NULL);
3787 if (!obj || !JS_SetPrivate(cx, obj, re) || !js_SetLastIndex(cx, obj, 0)) {
3788 js_DestroyRegExp(cx, re);
3789 return NULL;
3790 }
3791 return obj;
3792 }
3794 JSObject *
3795 js_CloneRegExpObject(JSContext *cx, JSObject *obj, JSObject *parent)
3796 {
3797 JSObject *clone;
3798 JSRegExp *re;
3800 JS_ASSERT(OBJ_GET_CLASS(cx, obj) == &js_RegExpClass);
3801 clone = js_NewObject(cx, &js_RegExpClass, NULL, parent);
3802 if (!clone)
3803 return NULL;
3804 re = JS_GetPrivate(cx, obj);
3805 if (!JS_SetPrivate(cx, clone, re) || !js_SetLastIndex(cx, clone, 0)) {
3806 cx->newborn[GCX_OBJECT] = NULL;
3807 return NULL;
3808 }
3809 HOLD_REGEXP(cx, re);
3810 return clone;
3811 }
3813 JSBool
3814 js_GetLastIndex(JSContext *cx, JSObject *obj, jsdouble *lastIndex)
3815 {
3816 jsval v;
3818 return JS_GetReservedSlot(cx, obj, 0, &v) &&
3819 js_ValueToNumber(cx, v, lastIndex);
3820 }
3822 JSBool
3823 js_SetLastIndex(JSContext *cx, JSObject *obj, jsdouble lastIndex)
3824 {
3825 jsval v;
3827 return js_NewNumberValue(cx, lastIndex, &v) &&
3828 JS_SetReservedSlot(cx, obj, 0, v);
3829 }
3831 #endif /* JS_HAS_REGEXPS */