diff options
Diffstat (limited to 'libc/misc/regex')
-rw-r--r-- | libc/misc/regex/regex.c | 2257 |
1 files changed, 1128 insertions, 1129 deletions
diff --git a/libc/misc/regex/regex.c b/libc/misc/regex/regex.c index d14595dfd..350535fa1 100644 --- a/libc/misc/regex/regex.c +++ b/libc/misc/regex/regex.c @@ -1832,1139 +1832,11 @@ int num_regs; #endif /* not MATCH_MAY_ALLOCATE */ -static boolean group_in_compile_stack _RE_ARGS((compile_stack_type - compile_stack, - - regnum_t regnum)); - -/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. - Returns one of error codes defined in `regex.h', or zero for success. - - Assumes the `allocated' (and perhaps `buffer') and `translate' - fields are set in BUFP on entry. - - If it succeeds, results are put in BUFP (if it returns an error, the - contents of BUFP are undefined): - `buffer' is the compiled pattern; - `syntax' is set to SYNTAX; - `used' is set to the length of the compiled pattern; - `fastmap_accurate' is zero; - `re_nsub' is the number of subexpressions in PATTERN; - `not_bol' and `not_eol' are zero; - - The `fastmap' and `newline_anchor' fields are neither - examined nor set. */ - -/* Return, freeing storage we allocated. */ -#define FREE_STACK_RETURN(value) \ - return (free (compile_stack.stack), value) - -static reg_errcode_t regex_compile(pattern, size, syntax, bufp) -const char *pattern; -size_t size; -reg_syntax_t syntax; -struct re_pattern_buffer *bufp; -{ - /* We fetch characters from PATTERN here. Even though PATTERN is - `char *' (i.e., signed), we declare these variables as unsigned, so - they can be reliably used as array indices. */ - register unsigned char c, c1; - - /* A random temporary spot in PATTERN. */ - const char *p1; - - /* Points to the end of the buffer, where we should append. */ - register unsigned char *b; - - /* Keeps track of unclosed groups. */ - compile_stack_type compile_stack; - - /* Points to the current (ending) position in the pattern. */ - const char *p = pattern; - const char *pend = pattern + size; - - /* How to translate the characters in the pattern. */ - RE_TRANSLATE_TYPE translate = bufp->translate; - - /* Address of the count-byte of the most recently inserted `exactn' - command. This makes it possible to tell if a new exact-match - character can be added to that command or if the character requires - a new `exactn' command. */ - unsigned char *pending_exact = 0; - - /* Address of start of the most recently finished expression. - This tells, e.g., postfix * where to find the start of its - operand. Reset at the beginning of groups and alternatives. */ - unsigned char *laststart = 0; - - /* Address of beginning of regexp, or inside of last group. */ - unsigned char *begalt; - - /* Place in the uncompiled pattern (i.e., the {) to - which to go back if the interval is invalid. */ - const char *beg_interval; - - /* Address of the place where a forward jump should go to the end of - the containing expression. Each alternative of an `or' -- except the - last -- ends with a forward jump of this sort. */ - unsigned char *fixup_alt_jump = 0; - - /* Counts open-groups as they are encountered. Remembered for the - matching close-group on the compile stack, so the same register - number is put in the stop_memory as the start_memory. */ - regnum_t regnum = 0; - -#ifdef DEBUG - DEBUG_PRINT1("\nCompiling pattern: "); - if (debug) { - unsigned debug_count; - - for (debug_count = 0; debug_count < size; debug_count++) - putchar(pattern[debug_count]); - putchar('\n'); - } -#endif /* DEBUG */ - - /* Initialize the compile stack. */ - compile_stack.stack = - TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); - if (compile_stack.stack == NULL) - return REG_ESPACE; - - compile_stack.size = INIT_COMPILE_STACK_SIZE; - compile_stack.avail = 0; - - /* Initialize the pattern buffer. */ - bufp->syntax = syntax; - bufp->fastmap_accurate = 0; - bufp->not_bol = bufp->not_eol = 0; - - /* Set `used' to zero, so that if we return an error, the pattern - printer (for debugging) will think there's no pattern. We reset it - at the end. */ - bufp->used = 0; - - /* Always count groups, whether or not bufp->no_sub is set. */ - bufp->re_nsub = 0; - -#if !defined emacs && !defined SYNTAX_TABLE - /* Initialize the syntax table. */ - init_syntax_once(); -#endif - - if (bufp->allocated == 0) { - if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc - enough space. This loses if buffer's address is bogus, but - that is the user's responsibility. */ - RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char); - } else { /* Caller did not allocate a buffer. Do it for them. */ - bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char); - } - if (!bufp->buffer) - FREE_STACK_RETURN(REG_ESPACE); - - bufp->allocated = INIT_BUF_SIZE; - } - - begalt = b = bufp->buffer; - - /* Loop through the uncompiled pattern until we're at the end. */ - while (p != pend) { - PATFETCH(c); - - switch (c) { - case '^': - { - if ( /* If at start of pattern, it's an operator. */ - p == pattern + 1 - /* If context independent, it's an operator. */ - || syntax & RE_CONTEXT_INDEP_ANCHORS - /* Otherwise, depends on what's come before. */ - || at_begline_loc_p(pattern, p, syntax)) - BUF_PUSH(begline); - else - goto normal_char; - } - break; - - - case '$': - { - if ( /* If at end of pattern, it's an operator. */ - p == pend - /* If context independent, it's an operator. */ - || syntax & RE_CONTEXT_INDEP_ANCHORS - /* Otherwise, depends on what's next. */ - || at_endline_loc_p(p, pend, syntax)) - BUF_PUSH(endline); - else - goto normal_char; - } - break; - - - case '+': - case '?': - if ((syntax & RE_BK_PLUS_QM) - || (syntax & RE_LIMITED_OPS)) - goto normal_char; - handle_plus: - case '*': - /* If there is no previous pattern... */ - if (!laststart) { - if (syntax & RE_CONTEXT_INVALID_OPS) - FREE_STACK_RETURN(REG_BADRPT); - else if (!(syntax & RE_CONTEXT_INDEP_OPS)) - goto normal_char; - } - - { - /* Are we optimizing this jump? */ - boolean keep_string_p = false; - - /* 1 means zero (many) matches is allowed. */ - char zero_times_ok = 0, many_times_ok = 0; - - /* If there is a sequence of repetition chars, collapse it - down to just one (the right one). We can't combine - interval operators with these because of, e.g., `a{2}*', - which should only match an even number of `a's. */ - - for (;;) { - zero_times_ok |= c != '+'; - many_times_ok |= c != '?'; - - if (p == pend) - break; - - PATFETCH(c); - - if (c == '*' - || (!(syntax & RE_BK_PLUS_QM) - && (c == '+' || c == '?'))); - - else if (syntax & RE_BK_PLUS_QM && c == '\\') { - if (p == pend) - FREE_STACK_RETURN(REG_EESCAPE); - - PATFETCH(c1); - if (!(c1 == '+' || c1 == '?')) { - PATUNFETCH; - PATUNFETCH; - break; - } - - c = c1; - } else { - PATUNFETCH; - break; - } - - /* If we get here, we found another repeat character. */ - } - - /* Star, etc. applied to an empty pattern is equivalent - to an empty pattern. */ - if (!laststart) - break; - - /* Now we know whether or not zero matches is allowed - and also whether or not two or more matches is allowed. */ - if (many_times_ok) { /* More than one repetition is allowed, so put in at the - end a backward relative jump from `b' to before the next - jump we're going to put in below (which jumps from - laststart to after this jump). - - But if we are at the `*' in the exact sequence `.*\n', - insert an unconditional jump backwards to the ., - instead of the beginning of the loop. This way we only - push a failure point once, instead of every time - through the loop. */ - assert(p - 1 > pattern); - - /* Allocate the space for the jump. */ - GET_BUFFER_SPACE(3); - - /* We know we are not at the first character of the pattern, - because laststart was nonzero. And we've already - incremented `p', by the way, to be the character after - the `*'. Do we have to do something analogous here - for null bytes, because of RE_DOT_NOT_NULL? */ - if (TRANSLATE(*(p - 2)) == TRANSLATE('.') - && zero_times_ok - && p < pend && TRANSLATE(*p) == TRANSLATE('\n') - && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ - STORE_JUMP(jump, b, laststart); - keep_string_p = true; - } else - /* Anything else. */ - STORE_JUMP(maybe_pop_jump, b, laststart - 3); - - /* We've added more stuff to the buffer. */ - b += 3; - } - - /* On failure, jump from laststart to b + 3, which will be the - end of the buffer after this jump is inserted. */ - GET_BUFFER_SPACE(3); - INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump - : on_failure_jump, laststart, b + 3); - pending_exact = 0; - b += 3; - - if (!zero_times_ok) { - /* At least one repetition is required, so insert a - `dummy_failure_jump' before the initial - `on_failure_jump' instruction of the loop. This - effects a skip over that instruction the first time - we hit that loop. */ - GET_BUFFER_SPACE(3); - INSERT_JUMP(dummy_failure_jump, laststart, - laststart + 6); - b += 3; - } - } - break; - - - case '.': - laststart = b; - BUF_PUSH(anychar); - break; - - - case '[': - { - boolean had_char_class = false; - - if (p == pend) - FREE_STACK_RETURN(REG_EBRACK); - - /* Ensure that we have enough space to push a charset: the - opcode, the length count, and the bitset; 34 bytes in all. */ - GET_BUFFER_SPACE(34); - - laststart = b; - - /* We test `*p == '^' twice, instead of using an if - statement, so we only need one BUF_PUSH. */ - BUF_PUSH(*p == '^' ? charset_not : charset); - if (*p == '^') - p++; - - /* Remember the first position in the bracket expression. */ - p1 = p; - - /* Push the number of bytes in the bitmap. */ - BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH); - - /* Clear the whole map. */ - bzero(b, (1 << BYTEWIDTH) / BYTEWIDTH); - - /* charset_not matches newline according to a syntax bit. */ - if ((re_opcode_t) b[-2] == charset_not - && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) SET_LIST_BIT('\n'); - - /* Read in characters and ranges, setting map bits. */ - for (;;) { - if (p == pend) - FREE_STACK_RETURN(REG_EBRACK); - - PATFETCH(c); - - /* \ might escape characters inside [...] and [^...]. */ - if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') { - if (p == pend) - FREE_STACK_RETURN(REG_EESCAPE); - - PATFETCH(c1); - SET_LIST_BIT(c1); - continue; - } - - /* Could be the end of the bracket expression. If it's - not (i.e., when the bracket expression is `[]' so - far), the ']' character bit gets set way below. */ - if (c == ']' && p != p1 + 1) - break; - - /* Look ahead to see if it's a range when the last thing - was a character class. */ - if (had_char_class && c == '-' && *p != ']') - FREE_STACK_RETURN(REG_ERANGE); - - /* Look ahead to see if it's a range when the last thing - was a character: if this is a hyphen not at the - beginning or the end of a list, then it's the range - operator. */ - if (c == '-' && !(p - 2 >= pattern && p[-2] == '[') - && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') - && *p != ']') { - reg_errcode_t ret - = compile_range(&p, pend, translate, syntax, b); - - if (ret != REG_NOERROR) - FREE_STACK_RETURN(ret); - } - - else if (p[0] == '-' && p[1] != ']') { /* This handles ranges made up of characters only. */ - reg_errcode_t ret; - - /* Move past the `-'. */ - PATFETCH(c1); - - ret = compile_range(&p, pend, translate, syntax, b); - if (ret != REG_NOERROR) - FREE_STACK_RETURN(ret); - } - - /* See if we're at the beginning of a possible character - class. */ - - else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') { /* Leave room for the null. */ - char str[CHAR_CLASS_MAX_LENGTH + 1]; - - PATFETCH(c); - c1 = 0; - - /* If pattern is `[[:'. */ - if (p == pend) - FREE_STACK_RETURN(REG_EBRACK); - - for (;;) { - PATFETCH(c); - if ((c == ':' && *p == ']') || p == pend) - break; - if (c1 < CHAR_CLASS_MAX_LENGTH) - str[c1++] = c; - else - /* This is in any case an invalid class name. */ - str[0] = '\0'; - } - str[c1] = '\0'; - - /* If isn't a word bracketed by `[:' and `:]': - undo the ending character, the letters, and leave - the leading `:' and `[' (but set bits for them). */ - if (c == ':' && *p == ']') { -#if defined _LIBC || WIDE_CHAR_SUPPORT - boolean is_lower = STREQ(str, "lower"); - boolean is_upper = STREQ(str, "upper"); - wctype_t wt; - int ch; - - wt = IS_CHAR_CLASS(str); - if (wt == 0) - FREE_STACK_RETURN(REG_ECTYPE); - - /* Throw away the ] at the end of the character - class. */ - PATFETCH(c); - - if (p == pend) - FREE_STACK_RETURN(REG_EBRACK); - - for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) { -# ifdef _LIBC - if (__iswctype(__btowc(ch), wt)) - SET_LIST_BIT(ch); -# else - if (iswctype(btowc(ch), wt)) - SET_LIST_BIT(ch); -# endif - - if (translate && (is_upper || is_lower) - && (ISUPPER(ch) || ISLOWER(ch))) - SET_LIST_BIT(ch); - } - - had_char_class = true; -#else - int ch; - boolean is_alnum = STREQ(str, "alnum"); - boolean is_alpha = STREQ(str, "alpha"); - boolean is_blank = STREQ(str, "blank"); - boolean is_cntrl = STREQ(str, "cntrl"); - boolean is_digit = STREQ(str, "digit"); - boolean is_graph = STREQ(str, "graph"); - boolean is_lower = STREQ(str, "lower"); - boolean is_print = STREQ(str, "print"); - boolean is_punct = STREQ(str, "punct"); - boolean is_space = STREQ(str, "space"); - boolean is_upper = STREQ(str, "upper"); - boolean is_xdigit = STREQ(str, "xdigit"); - - if (!IS_CHAR_CLASS(str)) - FREE_STACK_RETURN(REG_ECTYPE); - - /* Throw away the ] at the end of the character - class. */ - PATFETCH(c); - - if (p == pend) - FREE_STACK_RETURN(REG_EBRACK); - - for (ch = 0; ch < 1 << BYTEWIDTH; ch++) { - /* This was split into 3 if's to - avoid an arbitrary limit in some compiler. */ - if ((is_alnum && ISALNUM(ch)) - || (is_alpha && ISALPHA(ch)) - || (is_blank && ISBLANK(ch)) - || (is_cntrl && ISCNTRL(ch))) - SET_LIST_BIT(ch); - if ((is_digit && ISDIGIT(ch)) - || (is_graph && ISGRAPH(ch)) - || (is_lower && ISLOWER(ch)) - || (is_print && ISPRINT(ch))) - SET_LIST_BIT(ch); - if ((is_punct && ISPUNCT(ch)) - || (is_space && ISSPACE(ch)) - || (is_upper && ISUPPER(ch)) - || (is_xdigit && ISXDIGIT(ch))) - SET_LIST_BIT(ch); - if (translate && (is_upper || is_lower) - && (ISUPPER(ch) || ISLOWER(ch))) - SET_LIST_BIT(ch); - } - had_char_class = true; -#endif /* libc || wctype.h */ - } else { - c1++; - while (c1--) - PATUNFETCH; - SET_LIST_BIT('['); - SET_LIST_BIT(':'); - had_char_class = false; - } - } else { - had_char_class = false; - SET_LIST_BIT(c); - } - } - - /* Discard any (non)matching list bytes that are all 0 at the - end of the map. Decrease the map-length byte too. */ - while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) - b[-1]--; - b += b[-1]; - } - break; - - - case '(': - if (syntax & RE_NO_BK_PARENS) - goto handle_open; - else - goto normal_char; - - - case ')': - if (syntax & RE_NO_BK_PARENS) - goto handle_close; - else - goto normal_char; - - - case '\n': - if (syntax & RE_NEWLINE_ALT) - goto handle_alt; - else - goto normal_char; - - - case '|': - if (syntax & RE_NO_BK_VBAR) - goto handle_alt; - else - goto normal_char; - - - case '{': - if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) - goto handle_interval; - else - goto normal_char; - - - case '\\': - if (p == pend) - FREE_STACK_RETURN(REG_EESCAPE); - - /* Do not translate the character after the \, so that we can - distinguish, e.g., \B from \b, even if we normally would - translate, e.g., B to b. */ - PATFETCH_RAW(c); - - switch (c) { - case '(': - if (syntax & RE_NO_BK_PARENS) - goto normal_backslash; - - handle_open: - bufp->re_nsub++; - regnum++; - - if (COMPILE_STACK_FULL) { - RETALLOC(compile_stack.stack, compile_stack.size << 1, - compile_stack_elt_t); - if (compile_stack.stack == NULL) - return REG_ESPACE; - - compile_stack.size <<= 1; - } - - /* These are the values to restore when we hit end of this - group. They are all relative offsets, so that if the - whole pattern moves because of realloc, they will still - be valid. */ - COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; - COMPILE_STACK_TOP.fixup_alt_jump - = - fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; - COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; - COMPILE_STACK_TOP.regnum = regnum; - - /* We will eventually replace the 0 with the number of - groups inner to this one. But do not push a - start_memory for groups beyond the last one we can - represent in the compiled pattern. */ - if (regnum <= MAX_REGNUM) { - COMPILE_STACK_TOP.inner_group_offset = - b - bufp->buffer + 2; - BUF_PUSH_3(start_memory, regnum, 0); - } - - compile_stack.avail++; - - fixup_alt_jump = 0; - laststart = 0; - begalt = b; - /* If we've reached MAX_REGNUM groups, then this open - won't actually generate any code, so we'll have to - clear pending_exact explicitly. */ - pending_exact = 0; - break; - - - case ')': - if (syntax & RE_NO_BK_PARENS) - goto normal_backslash; - - if (COMPILE_STACK_EMPTY) { - if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) - goto normal_backslash; - else - FREE_STACK_RETURN(REG_ERPAREN); - } - - handle_close: - if (fixup_alt_jump) { /* Push a dummy failure point at the end of the - alternative for a possible future - `pop_failure_jump' to pop. See comments at - `push_dummy_failure' in `re_match_2'. */ - BUF_PUSH(push_dummy_failure); - - /* We allocated space for this jump when we assigned - to `fixup_alt_jump', in the `handle_alt' case below. */ - STORE_JUMP(jump_past_alt, fixup_alt_jump, b - 1); - } - - /* See similar code for backslashed left paren above. */ - if (COMPILE_STACK_EMPTY) { - if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) - goto normal_char; - else - FREE_STACK_RETURN(REG_ERPAREN); - } - - /* Since we just checked for an empty stack above, this - ``can't happen''. */ - assert(compile_stack.avail != 0); - { - /* We don't just want to restore into `regnum', because - later groups should continue to be numbered higher, - as in `(ab)c(de)' -- the second group is #2. */ - regnum_t this_group_regnum; - - compile_stack.avail--; - begalt = - bufp->buffer + COMPILE_STACK_TOP.begalt_offset; - fixup_alt_jump = - COMPILE_STACK_TOP.fixup_alt_jump ? bufp->buffer + - COMPILE_STACK_TOP.fixup_alt_jump - 1 : 0; - laststart = - bufp->buffer + COMPILE_STACK_TOP.laststart_offset; - this_group_regnum = COMPILE_STACK_TOP.regnum; - /* If we've reached MAX_REGNUM groups, then this open - won't actually generate any code, so we'll have to - clear pending_exact explicitly. */ - pending_exact = 0; - - /* We're at the end of the group, so now we know how many - groups were inside this one. */ - if (this_group_regnum <= MAX_REGNUM) { - unsigned char *inner_group_loc - - = - bufp->buffer + - COMPILE_STACK_TOP.inner_group_offset; - - *inner_group_loc = regnum - this_group_regnum; - BUF_PUSH_3(stop_memory, this_group_regnum, - regnum - this_group_regnum); - } - } - break; - - - case '|': /* `\|'. */ - if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) - goto normal_backslash; - handle_alt: - if (syntax & RE_LIMITED_OPS) - goto normal_char; - - /* Insert before the previous alternative a jump which - jumps to this alternative if the former fails. */ - GET_BUFFER_SPACE(3); - INSERT_JUMP(on_failure_jump, begalt, b + 6); - pending_exact = 0; - b += 3; - - /* The alternative before this one has a jump after it - which gets executed if it gets matched. Adjust that - jump so it will jump to this alternative's analogous - jump (put in below, which in turn will jump to the next - (if any) alternative's such jump, etc.). The last such - jump jumps to the correct final destination. A picture: - _____ _____ - | | | | - | v | v - a | b | c - - If we are at `b', then fixup_alt_jump right now points to a - three-byte space after `a'. We'll put in the jump, set - fixup_alt_jump to right after `b', and leave behind three - bytes which we'll fill in when we get to after `c'. */ - - if (fixup_alt_jump) - STORE_JUMP(jump_past_alt, fixup_alt_jump, b); - - /* Mark and leave space for a jump after this alternative, - to be filled in later either by next alternative or - when know we're at the end of a series of alternatives. */ - fixup_alt_jump = b; - GET_BUFFER_SPACE(3); - b += 3; - - laststart = 0; - begalt = b; - break; - - - case '{': - /* If \{ is a literal. */ - if (!(syntax & RE_INTERVALS) - /* If we're at `\{' and it's not the open-interval - operator. */ - || ((syntax & RE_INTERVALS) - && (syntax & RE_NO_BK_BRACES)) || (p - 2 == pattern - && p == pend)) - goto normal_backslash; - - handle_interval: - { - /* If got here, then the syntax allows intervals. */ - - /* At least (most) this many matches must be made. */ - int lower_bound = -1, upper_bound = -1; - - beg_interval = p - 1; - - if (p == pend) { - if (!(syntax & RE_INTERVALS) - && (syntax & RE_NO_BK_BRACES)) goto - unfetch_interval; - else - FREE_STACK_RETURN(REG_EBRACE); - } - - GET_UNSIGNED_NUMBER(lower_bound); - - if (c == ',') { - GET_UNSIGNED_NUMBER(upper_bound); - if ((!(syntax & RE_NO_BK_BRACES) && c != '\\') - || ((syntax & RE_NO_BK_BRACES) && c != '}')) - FREE_STACK_RETURN(REG_BADBR); - - if (upper_bound < 0) - upper_bound = RE_DUP_MAX; - } else - /* Interval such as `{1}' => match exactly once. */ - upper_bound = lower_bound; - - if (lower_bound < 0 || upper_bound > RE_DUP_MAX - || lower_bound > upper_bound) { - if (!(syntax & RE_INTERVALS) - && (syntax & RE_NO_BK_BRACES)) goto - unfetch_interval; - else - FREE_STACK_RETURN(REG_BADBR); - } - - if (!(syntax & RE_NO_BK_BRACES)) { - if (c != '\\') - FREE_STACK_RETURN(REG_EBRACE); - - PATFETCH(c); - } - - if (c != '}') { - if (!(syntax & RE_INTERVALS) - && (syntax & RE_NO_BK_BRACES)) goto - unfetch_interval; - else - FREE_STACK_RETURN(REG_BADBR); - } - - /* We just parsed a valid interval. */ - - /* If it's invalid to have no preceding re. */ - if (!laststart) { - if (syntax & RE_CONTEXT_INVALID_OPS) - FREE_STACK_RETURN(REG_BADRPT); - else if (syntax & RE_CONTEXT_INDEP_OPS) - laststart = b; - else - goto unfetch_interval; - } - - /* If the upper bound is zero, don't want to succeed at - all; jump from `laststart' to `b + 3', which will be - the end of the buffer after we insert the jump. */ - if (upper_bound == 0) { - GET_BUFFER_SPACE(3); - INSERT_JUMP(jump, laststart, b + 3); - b += 3; - } - - /* Otherwise, we have a nontrivial interval. When - we're all done, the pattern will look like: - set_number_at <jump count> <upper bound> - set_number_at <succeed_n count> <lower bound> - succeed_n <after jump addr> <succeed_n count> - <body of loop> - jump_n <succeed_n addr> <jump count> - (The upper bound and `jump_n' are omitted if - `upper_bound' is 1, though.) */ - else { /* If the upper bound is > 1, we need to insert - more at the end of the loop. */ - unsigned nbytes = 10 + (upper_bound > 1) * 10; - - GET_BUFFER_SPACE(nbytes); - - /* Initialize lower bound of the `succeed_n', even - though it will be set during matching by its - attendant `set_number_at' (inserted next), - because `re_compile_fastmap' needs to know. - Jump to the `jump_n' we might insert below. */ - INSERT_JUMP2(succeed_n, laststart, - b + 5 + (upper_bound > 1) * 5, - lower_bound); - b += 5; - - /* Code to initialize the lower bound. Insert - before the `succeed_n'. The `5' is the last two - bytes of this `set_number_at', plus 3 bytes of - the following `succeed_n'. */ - insert_op2(set_number_at, laststart, 5, - lower_bound, b); - b += 5; - - if (upper_bound > 1) { /* More than one repetition is allowed, so - append a backward jump to the `succeed_n' - that starts this interval. - - When we've reached this during matching, - we'll have matched the interval once, so - jump back only `upper_bound - 1' times. */ - STORE_JUMP2(jump_n, b, laststart + 5, - upper_bound - 1); - b += 5; - - /* The location we want to set is the second - parameter of the `jump_n'; that is `b-2' as - an absolute address. `laststart' will be - the `set_number_at' we're about to insert; - `laststart+3' the number to set, the source - for the relative address. But we are - inserting into the middle of the pattern -- - so everything is getting moved up by 5. - Conclusion: (b - 2) - (laststart + 3) + 5, - i.e., b - laststart. - - We insert this at the beginning of the loop - so that if we fail during matching, we'll - reinitialize the bounds. */ - insert_op2(set_number_at, laststart, - b - laststart, upper_bound - 1, b); - b += 5; - } - } - pending_exact = 0; - beg_interval = NULL; - } - break; - - unfetch_interval: - /* If an invalid interval, match the characters as literals. */ - assert(beg_interval); - p = beg_interval; - beg_interval = NULL; - - /* normal_char and normal_backslash need `c'. */ - PATFETCH(c); - - if (!(syntax & RE_NO_BK_BRACES)) { - if (p > pattern && p[-1] == '\\') - goto normal_backslash; - } - goto normal_char; - -#ifdef emacs - /* There is no way to specify the before_dot and after_dot - operators. rms says this is ok. --karl */ - case '=': - BUF_PUSH(at_dot); - break; - - case 's': - laststart = b; - PATFETCH(c); - BUF_PUSH_2(syntaxspec, syntax_spec_code[c]); - break; - - case 'S': - laststart = b; - PATFETCH(c); - BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]); - break; -#endif /* emacs */ - - - case 'w': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - laststart = b; - BUF_PUSH(wordchar); - break; - - - case 'W': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - laststart = b; - BUF_PUSH(notwordchar); - break; - - - case '<': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - BUF_PUSH(wordbeg); - break; - - case '>': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - BUF_PUSH(wordend); - break; - - case 'b': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - BUF_PUSH(wordbound); - break; - - case 'B': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - BUF_PUSH(notwordbound); - break; - - case '`': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - BUF_PUSH(begbuf); - break; - - case '\'': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - BUF_PUSH(endbuf); - break; - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - if (syntax & RE_NO_BK_REFS) - goto normal_char; - - c1 = c - '0'; - - if (c1 > regnum) - FREE_STACK_RETURN(REG_ESUBREG); - - /* Can't back reference to a subexpression if inside of it. */ - if (group_in_compile_stack(compile_stack, (regnum_t) c1)) - goto normal_char; - - laststart = b; - BUF_PUSH_2(duplicate, c1); - break; - - - case '+': - case '?': - if (syntax & RE_BK_PLUS_QM) - goto handle_plus; - else - goto normal_backslash; - - default: - normal_backslash: - /* You might think it would be useful for \ to mean - not to translate; but if we don't translate it - it will never match anything. */ - c = TRANSLATE(c); - goto normal_char; - } - break; - - - default: - /* Expects the character in `c'. */ - normal_char: - /* If no exactn currently being built. */ - if (!pending_exact - /* If last exactn not at current position. */ - || pending_exact + *pending_exact + 1 != b - /* We have only one byte following the exactn for the count. */ - || *pending_exact == (1 << BYTEWIDTH) - 1 - /* If followed by a repetition operator. */ - || *p == '*' || *p == '^' || ((syntax & RE_BK_PLUS_QM) - ? *p == '\\' && (p[1] == '+' - || p[1] == - '?') : (*p - == - '+' - || - *p - == - '?')) - || ((syntax & RE_INTERVALS) - && ((syntax & RE_NO_BK_BRACES) - ? *p == '{' : (p[0] == '\\' && p[1] == '{')))) { - /* Start building a new exactn. */ - - laststart = b; - - BUF_PUSH_2(exactn, 0); - pending_exact = b - 1; - } - - BUF_PUSH(c); - (*pending_exact)++; - break; - } /* switch (c) */ - } /* while p != pend */ - - - /* Through the pattern now. */ - - if (fixup_alt_jump) - STORE_JUMP(jump_past_alt, fixup_alt_jump, b); - - if (!COMPILE_STACK_EMPTY) - FREE_STACK_RETURN(REG_EPAREN); - - /* If we don't want backtracking, force success - the first time we reach the end of the compiled pattern. */ - if (syntax & RE_NO_POSIX_BACKTRACKING) - BUF_PUSH(succeed); - - free(compile_stack.stack); - - /* We have succeeded; set the length of the buffer. */ - bufp->used = b - bufp->buffer; - -#ifdef DEBUG - if (debug) { - DEBUG_PRINT1("\nCompiled pattern: \n"); - print_compiled_pattern(bufp); - } -#endif /* DEBUG */ - -#ifndef MATCH_MAY_ALLOCATE - /* Initialize the failure stack to the largest possible stack. This - isn't necessary unless we're trying to avoid calling alloca in - the search and match routines. */ - { - int num_regs = bufp->re_nsub + 1; - - /* Since DOUBLE_FAIL_STACK refuses to double only if the current size - is strictly greater than re_max_failures, the largest possible stack - is 2 * re_max_failures failure points. */ - if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) { - fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); - -# ifdef emacs - if (!fail_stack.stack) - fail_stack.stack - = (fail_stack_elt_t *) xmalloc(fail_stack.size - * - sizeof - (fail_stack_elt_t)); - else - fail_stack.stack = - (fail_stack_elt_t *) xrealloc(fail_stack.stack, - (fail_stack.size * - sizeof - (fail_stack_elt_t))); -# else /* not emacs */ - if (!fail_stack.stack) - fail_stack.stack - = (fail_stack_elt_t *) malloc(fail_stack.size - * - sizeof - (fail_stack_elt_t)); - else - fail_stack.stack = - (fail_stack_elt_t *) realloc(fail_stack.stack, - (fail_stack.size * - sizeof - (fail_stack_elt_t))); -# endif /* not emacs */ - } - - regex_grow_registers(num_regs); - } -#endif /* not MATCH_MAY_ALLOCATE */ - - return REG_NOERROR; -} /* regex_compile */ - /* Subroutines for `regex_compile'. */ /* Store OP at LOC followed by two-byte integer parameter ARG. */ -static void store_op1(op, loc, arg) +static inline void store_op1(op, loc, arg) re_opcode_t op; unsigned char *loc; int arg; @@ -3068,6 +1940,10 @@ reg_syntax_t syntax; /* Returns true if REGNUM is in one of COMPILE_STACK's elements and false if it's not. */ +static boolean group_in_compile_stack _RE_ARGS((compile_stack_type + compile_stack, + regnum_t regnum)); + static boolean group_in_compile_stack(compile_stack, regnum) compile_stack_type compile_stack; regnum_t regnum; @@ -5727,3 +4603,1126 @@ regex_t *preg; weak_alias(__regfree, regfree) #endif #endif /* not emacs */ + +/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. + Returns one of error codes defined in `regex.h', or zero for success. + + Assumes the `allocated' (and perhaps `buffer') and `translate' + fields are set in BUFP on entry. + + If it succeeds, results are put in BUFP (if it returns an error, the + contents of BUFP are undefined): + `buffer' is the compiled pattern; + `syntax' is set to SYNTAX; + `used' is set to the length of the compiled pattern; + `fastmap_accurate' is zero; + `re_nsub' is the number of subexpressions in PATTERN; + `not_bol' and `not_eol' are zero; + + The `fastmap' and `newline_anchor' fields are neither + examined nor set. */ + +/* Return, freeing storage we allocated. */ +#define FREE_STACK_RETURN(value) \ + return (free (compile_stack.stack), value) + +static reg_errcode_t regex_compile(pattern, size, syntax, bufp) +const char *pattern; +size_t size; +reg_syntax_t syntax; +struct re_pattern_buffer *bufp; +{ + /* We fetch characters from PATTERN here. Even though PATTERN is + `char *' (i.e., signed), we declare these variables as unsigned, so + they can be reliably used as array indices. */ + register unsigned char c, c1; + + /* A random temporary spot in PATTERN. */ + const char *p1; + + /* Points to the end of the buffer, where we should append. */ + register unsigned char *b; + + /* Keeps track of unclosed groups. */ + compile_stack_type compile_stack; + + /* Points to the current (ending) position in the pattern. */ + const char *p = pattern; + const char *pend = pattern + size; + + /* How to translate the characters in the pattern. */ + RE_TRANSLATE_TYPE translate = bufp->translate; + + /* Address of the count-byte of the most recently inserted `exactn' + command. This makes it possible to tell if a new exact-match + character can be added to that command or if the character requires + a new `exactn' command. */ + unsigned char *pending_exact = 0; + + /* Address of start of the most recently finished expression. + This tells, e.g., postfix * where to find the start of its + operand. Reset at the beginning of groups and alternatives. */ + unsigned char *laststart = 0; + + /* Address of beginning of regexp, or inside of last group. */ + unsigned char *begalt; + + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. */ + const char *beg_interval; + + /* Address of the place where a forward jump should go to the end of + the containing expression. Each alternative of an `or' -- except the + last -- ends with a forward jump of this sort. */ + unsigned char *fixup_alt_jump = 0; + + /* Counts open-groups as they are encountered. Remembered for the + matching close-group on the compile stack, so the same register + number is put in the stop_memory as the start_memory. */ + regnum_t regnum = 0; + +#ifdef DEBUG + DEBUG_PRINT1("\nCompiling pattern: "); + if (debug) { + unsigned debug_count; + + for (debug_count = 0; debug_count < size; debug_count++) + putchar(pattern[debug_count]); + putchar('\n'); + } +#endif /* DEBUG */ + + /* Initialize the compile stack. */ + compile_stack.stack = + TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size = INIT_COMPILE_STACK_SIZE; + compile_stack.avail = 0; + + /* Initialize the pattern buffer. */ + bufp->syntax = syntax; + bufp->fastmap_accurate = 0; + bufp->not_bol = bufp->not_eol = 0; + + /* Set `used' to zero, so that if we return an error, the pattern + printer (for debugging) will think there's no pattern. We reset it + at the end. */ + bufp->used = 0; + + /* Always count groups, whether or not bufp->no_sub is set. */ + bufp->re_nsub = 0; + +#if !defined emacs && !defined SYNTAX_TABLE + /* Initialize the syntax table. */ + init_syntax_once(); +#endif + + if (bufp->allocated == 0) { + if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. */ + RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char); + } else { /* Caller did not allocate a buffer. Do it for them. */ + bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char); + } + if (!bufp->buffer) + FREE_STACK_RETURN(REG_ESPACE); + + bufp->allocated = INIT_BUF_SIZE; + } + + begalt = b = bufp->buffer; + + /* Loop through the uncompiled pattern until we're at the end. */ + while (p != pend) { + PATFETCH(c); + + switch (c) { + case '^': + { + if ( /* If at start of pattern, it's an operator. */ + p == pattern + 1 + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's come before. */ + || at_begline_loc_p(pattern, p, syntax)) + BUF_PUSH(begline); + else + goto normal_char; + } + break; + + + case '$': + { + if ( /* If at end of pattern, it's an operator. */ + p == pend + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's next. */ + || at_endline_loc_p(p, pend, syntax)) + BUF_PUSH(endline); + else + goto normal_char; + } + break; + + + case '+': + case '?': + if ((syntax & RE_BK_PLUS_QM) + || (syntax & RE_LIMITED_OPS)) + goto normal_char; + handle_plus: + case '*': + /* If there is no previous pattern... */ + if (!laststart) { + if (syntax & RE_CONTEXT_INVALID_OPS) + FREE_STACK_RETURN(REG_BADRPT); + else if (!(syntax & RE_CONTEXT_INDEP_OPS)) + goto normal_char; + } + + { + /* Are we optimizing this jump? */ + boolean keep_string_p = false; + + /* 1 means zero (many) matches is allowed. */ + char zero_times_ok = 0, many_times_ok = 0; + + /* If there is a sequence of repetition chars, collapse it + down to just one (the right one). We can't combine + interval operators with these because of, e.g., `a{2}*', + which should only match an even number of `a's. */ + + for (;;) { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + + if (p == pend) + break; + + PATFETCH(c); + + if (c == '*' + || (!(syntax & RE_BK_PLUS_QM) + && (c == '+' || c == '?'))); + + else if (syntax & RE_BK_PLUS_QM && c == '\\') { + if (p == pend) + FREE_STACK_RETURN(REG_EESCAPE); + + PATFETCH(c1); + if (!(c1 == '+' || c1 == '?')) { + PATUNFETCH; + PATUNFETCH; + break; + } + + c = c1; + } else { + PATUNFETCH; + break; + } + + /* If we get here, we found another repeat character. */ + } + + /* Star, etc. applied to an empty pattern is equivalent + to an empty pattern. */ + if (!laststart) + break; + + /* Now we know whether or not zero matches is allowed + and also whether or not two or more matches is allowed. */ + if (many_times_ok) { /* More than one repetition is allowed, so put in at the + end a backward relative jump from `b' to before the next + jump we're going to put in below (which jumps from + laststart to after this jump). + + But if we are at the `*' in the exact sequence `.*\n', + insert an unconditional jump backwards to the ., + instead of the beginning of the loop. This way we only + push a failure point once, instead of every time + through the loop. */ + assert(p - 1 > pattern); + + /* Allocate the space for the jump. */ + GET_BUFFER_SPACE(3); + + /* We know we are not at the first character of the pattern, + because laststart was nonzero. And we've already + incremented `p', by the way, to be the character after + the `*'. Do we have to do something analogous here + for null bytes, because of RE_DOT_NOT_NULL? */ + if (TRANSLATE(*(p - 2)) == TRANSLATE('.') + && zero_times_ok + && p < pend && TRANSLATE(*p) == TRANSLATE('\n') + && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ + STORE_JUMP(jump, b, laststart); + keep_string_p = true; + } else + /* Anything else. */ + STORE_JUMP(maybe_pop_jump, b, laststart - 3); + + /* We've added more stuff to the buffer. */ + b += 3; + } + + /* On failure, jump from laststart to b + 3, which will be the + end of the buffer after this jump is inserted. */ + GET_BUFFER_SPACE(3); + INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump + : on_failure_jump, laststart, b + 3); + pending_exact = 0; + b += 3; + + if (!zero_times_ok) { + /* At least one repetition is required, so insert a + `dummy_failure_jump' before the initial + `on_failure_jump' instruction of the loop. This + effects a skip over that instruction the first time + we hit that loop. */ + GET_BUFFER_SPACE(3); + INSERT_JUMP(dummy_failure_jump, laststart, + laststart + 6); + b += 3; + } + } + break; + + + case '.': + laststart = b; + BUF_PUSH(anychar); + break; + + + case '[': + { + boolean had_char_class = false; + + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + /* Ensure that we have enough space to push a charset: the + opcode, the length count, and the bitset; 34 bytes in all. */ + GET_BUFFER_SPACE(34); + + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH(*p == '^' ? charset_not : charset); + if (*p == '^') + p++; + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* Push the number of bytes in the bitmap. */ + BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH); + + /* Clear the whole map. */ + bzero(b, (1 << BYTEWIDTH) / BYTEWIDTH); + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-2] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) SET_LIST_BIT('\n'); + + /* Read in characters and ranges, setting map bits. */ + for (;;) { + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + PATFETCH(c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') { + if (p == pend) + FREE_STACK_RETURN(REG_EESCAPE); + + PATFETCH(c1); + SET_LIST_BIT(c1); + continue; + } + + /* Could be the end of the bracket expression. If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + FREE_STACK_RETURN(REG_ERANGE); + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') { + reg_errcode_t ret + = compile_range(&p, pend, translate, syntax, b); + + if (ret != REG_NOERROR) + FREE_STACK_RETURN(ret); + } + + else if (p[0] == '-' && p[1] != ']') { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH(c1); + + ret = compile_range(&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) + FREE_STACK_RETURN(ret); + } + + /* See if we're at the beginning of a possible character + class. */ + + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH(c); + c1 = 0; + + /* If pattern is `[[:'. */ + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + for (;;) { + PATFETCH(c); + if ((c == ':' && *p == ']') || p == pend) + break; + if (c1 < CHAR_CLASS_MAX_LENGTH) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and `:]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but set bits for them). */ + if (c == ':' && *p == ']') { +#if defined _LIBC || WIDE_CHAR_SUPPORT + boolean is_lower = STREQ(str, "lower"); + boolean is_upper = STREQ(str, "upper"); + wctype_t wt; + int ch; + + wt = IS_CHAR_CLASS(str); + if (wt == 0) + FREE_STACK_RETURN(REG_ECTYPE); + + /* Throw away the ] at the end of the character + class. */ + PATFETCH(c); + + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) { +# ifdef _LIBC + if (__iswctype(__btowc(ch), wt)) + SET_LIST_BIT(ch); +# else + if (iswctype(btowc(ch), wt)) + SET_LIST_BIT(ch); +# endif + + if (translate && (is_upper || is_lower) + && (ISUPPER(ch) || ISLOWER(ch))) + SET_LIST_BIT(ch); + } + + had_char_class = true; +#else + int ch; + boolean is_alnum = STREQ(str, "alnum"); + boolean is_alpha = STREQ(str, "alpha"); + boolean is_blank = STREQ(str, "blank"); + boolean is_cntrl = STREQ(str, "cntrl"); + boolean is_digit = STREQ(str, "digit"); + boolean is_graph = STREQ(str, "graph"); + boolean is_lower = STREQ(str, "lower"); + boolean is_print = STREQ(str, "print"); + boolean is_punct = STREQ(str, "punct"); + boolean is_space = STREQ(str, "space"); + boolean is_upper = STREQ(str, "upper"); + boolean is_xdigit = STREQ(str, "xdigit"); + + if (!IS_CHAR_CLASS(str)) + FREE_STACK_RETURN(REG_ECTYPE); + + /* Throw away the ] at the end of the character + class. */ + PATFETCH(c); + + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + for (ch = 0; ch < 1 << BYTEWIDTH; ch++) { + /* This was split into 3 if's to + avoid an arbitrary limit in some compiler. */ + if ((is_alnum && ISALNUM(ch)) + || (is_alpha && ISALPHA(ch)) + || (is_blank && ISBLANK(ch)) + || (is_cntrl && ISCNTRL(ch))) + SET_LIST_BIT(ch); + if ((is_digit && ISDIGIT(ch)) + || (is_graph && ISGRAPH(ch)) + || (is_lower && ISLOWER(ch)) + || (is_print && ISPRINT(ch))) + SET_LIST_BIT(ch); + if ((is_punct && ISPUNCT(ch)) + || (is_space && ISSPACE(ch)) + || (is_upper && ISUPPER(ch)) + || (is_xdigit && ISXDIGIT(ch))) + SET_LIST_BIT(ch); + if (translate && (is_upper || is_lower) + && (ISUPPER(ch) || ISLOWER(ch))) + SET_LIST_BIT(ch); + } + had_char_class = true; +#endif /* libc || wctype.h */ + } else { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT('['); + SET_LIST_BIT(':'); + had_char_class = false; + } + } else { + had_char_class = false; + SET_LIST_BIT(c); + } + } + + /* Discard any (non)matching list bytes that are all 0 at the + end of the map. Decrease the map-length byte too. */ + while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) + b[-1]--; + b += b[-1]; + } + break; + + + case '(': + if (syntax & RE_NO_BK_PARENS) + goto handle_open; + else + goto normal_char; + + + case ')': + if (syntax & RE_NO_BK_PARENS) + goto handle_close; + else + goto normal_char; + + + case '\n': + if (syntax & RE_NEWLINE_ALT) + goto handle_alt; + else + goto normal_char; + + + case '|': + if (syntax & RE_NO_BK_VBAR) + goto handle_alt; + else + goto normal_char; + + + case '{': + if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) + goto handle_interval; + else + goto normal_char; + + + case '\\': + if (p == pend) + FREE_STACK_RETURN(REG_EESCAPE); + + /* Do not translate the character after the \, so that we can + distinguish, e.g., \B from \b, even if we normally would + translate, e.g., B to b. */ + PATFETCH_RAW(c); + + switch (c) { + case '(': + if (syntax & RE_NO_BK_PARENS) + goto normal_backslash; + + handle_open: + bufp->re_nsub++; + regnum++; + + if (COMPILE_STACK_FULL) { + RETALLOC(compile_stack.stack, compile_stack.size << 1, + compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size <<= 1; + } + + /* These are the values to restore when we hit end of this + group. They are all relative offsets, so that if the + whole pattern moves because of realloc, they will still + be valid. */ + COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; + COMPILE_STACK_TOP.fixup_alt_jump + = + fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; + COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; + COMPILE_STACK_TOP.regnum = regnum; + + /* We will eventually replace the 0 with the number of + groups inner to this one. But do not push a + start_memory for groups beyond the last one we can + represent in the compiled pattern. */ + if (regnum <= MAX_REGNUM) { + COMPILE_STACK_TOP.inner_group_offset = + b - bufp->buffer + 2; + BUF_PUSH_3(start_memory, regnum, 0); + } + + compile_stack.avail++; + + fixup_alt_jump = 0; + laststart = 0; + begalt = b; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. */ + pending_exact = 0; + break; + + + case ')': + if (syntax & RE_NO_BK_PARENS) + goto normal_backslash; + + if (COMPILE_STACK_EMPTY) { + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_backslash; + else + FREE_STACK_RETURN(REG_ERPAREN); + } + + handle_close: + if (fixup_alt_jump) { /* Push a dummy failure point at the end of the + alternative for a possible future + `pop_failure_jump' to pop. See comments at + `push_dummy_failure' in `re_match_2'. */ + BUF_PUSH(push_dummy_failure); + + /* We allocated space for this jump when we assigned + to `fixup_alt_jump', in the `handle_alt' case below. */ + STORE_JUMP(jump_past_alt, fixup_alt_jump, b - 1); + } + + /* See similar code for backslashed left paren above. */ + if (COMPILE_STACK_EMPTY) { + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_char; + else + FREE_STACK_RETURN(REG_ERPAREN); + } + + /* Since we just checked for an empty stack above, this + ``can't happen''. */ + assert(compile_stack.avail != 0); + { + /* We don't just want to restore into `regnum', because + later groups should continue to be numbered higher, + as in `(ab)c(de)' -- the second group is #2. */ + regnum_t this_group_regnum; + + compile_stack.avail--; + begalt = + bufp->buffer + COMPILE_STACK_TOP.begalt_offset; + fixup_alt_jump = + COMPILE_STACK_TOP.fixup_alt_jump ? bufp->buffer + + COMPILE_STACK_TOP.fixup_alt_jump - 1 : 0; + laststart = + bufp->buffer + COMPILE_STACK_TOP.laststart_offset; + this_group_regnum = COMPILE_STACK_TOP.regnum; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. */ + pending_exact = 0; + + /* We're at the end of the group, so now we know how many + groups were inside this one. */ + if (this_group_regnum <= MAX_REGNUM) { + unsigned char *inner_group_loc + + = + bufp->buffer + + COMPILE_STACK_TOP.inner_group_offset; + + *inner_group_loc = regnum - this_group_regnum; + BUF_PUSH_3(stop_memory, this_group_regnum, + regnum - this_group_regnum); + } + } + break; + + + case '|': /* `\|'. */ + if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) + goto normal_backslash; + handle_alt: + if (syntax & RE_LIMITED_OPS) + goto normal_char; + + /* Insert before the previous alternative a jump which + jumps to this alternative if the former fails. */ + GET_BUFFER_SPACE(3); + INSERT_JUMP(on_failure_jump, begalt, b + 6); + pending_exact = 0; + b += 3; + + /* The alternative before this one has a jump after it + which gets executed if it gets matched. Adjust that + jump so it will jump to this alternative's analogous + jump (put in below, which in turn will jump to the next + (if any) alternative's such jump, etc.). The last such + jump jumps to the correct final destination. A picture: + _____ _____ + | | | | + | v | v + a | b | c + + If we are at `b', then fixup_alt_jump right now points to a + three-byte space after `a'. We'll put in the jump, set + fixup_alt_jump to right after `b', and leave behind three + bytes which we'll fill in when we get to after `c'. */ + + if (fixup_alt_jump) + STORE_JUMP(jump_past_alt, fixup_alt_jump, b); + + /* Mark and leave space for a jump after this alternative, + to be filled in later either by next alternative or + when know we're at the end of a series of alternatives. */ + fixup_alt_jump = b; + GET_BUFFER_SPACE(3); + b += 3; + + laststart = 0; + begalt = b; + break; + + + case '{': + /* If \{ is a literal. */ + if (!(syntax & RE_INTERVALS) + /* If we're at `\{' and it's not the open-interval + operator. */ + || ((syntax & RE_INTERVALS) + && (syntax & RE_NO_BK_BRACES)) || (p - 2 == pattern + && p == pend)) + goto normal_backslash; + + handle_interval: + { + /* If got here, then the syntax allows intervals. */ + + /* At least (most) this many matches must be made. */ + int lower_bound = -1, upper_bound = -1; + + beg_interval = p - 1; + + if (p == pend) { + if (!(syntax & RE_INTERVALS) + && (syntax & RE_NO_BK_BRACES)) goto + unfetch_interval; + else + FREE_STACK_RETURN(REG_EBRACE); + } + + GET_UNSIGNED_NUMBER(lower_bound); + + if (c == ',') { + GET_UNSIGNED_NUMBER(upper_bound); + if ((!(syntax & RE_NO_BK_BRACES) && c != '\\') + || ((syntax & RE_NO_BK_BRACES) && c != '}')) + FREE_STACK_RETURN(REG_BADBR); + + if (upper_bound < 0) + upper_bound = RE_DUP_MAX; + } else + /* Interval such as `{1}' => match exactly once. */ + upper_bound = lower_bound; + + if (lower_bound < 0 || upper_bound > RE_DUP_MAX + || lower_bound > upper_bound) { + if (!(syntax & RE_INTERVALS) + && (syntax & RE_NO_BK_BRACES)) goto + unfetch_interval; + else + FREE_STACK_RETURN(REG_BADBR); + } + + if (!(syntax & RE_NO_BK_BRACES)) { + if (c != '\\') + FREE_STACK_RETURN(REG_EBRACE); + + PATFETCH(c); + } + + if (c != '}') { + if (!(syntax & RE_INTERVALS) + && (syntax & RE_NO_BK_BRACES)) goto + unfetch_interval; + else + FREE_STACK_RETURN(REG_BADBR); + } + + /* We just parsed a valid interval. */ + + /* If it's invalid to have no preceding re. */ + if (!laststart) { + if (syntax & RE_CONTEXT_INVALID_OPS) + FREE_STACK_RETURN(REG_BADRPT); + else if (syntax & RE_CONTEXT_INDEP_OPS) + laststart = b; + else + goto unfetch_interval; + } + + /* If the upper bound is zero, don't want to succeed at + all; jump from `laststart' to `b + 3', which will be + the end of the buffer after we insert the jump. */ + if (upper_bound == 0) { + GET_BUFFER_SPACE(3); + INSERT_JUMP(jump, laststart, b + 3); + b += 3; + } + + /* Otherwise, we have a nontrivial interval. When + we're all done, the pattern will look like: + set_number_at <jump count> <upper bound> + set_number_at <succeed_n count> <lower bound> + succeed_n <after jump addr> <succeed_n count> + <body of loop> + jump_n <succeed_n addr> <jump count> + (The upper bound and `jump_n' are omitted if + `upper_bound' is 1, though.) */ + else { /* If the upper bound is > 1, we need to insert + more at the end of the loop. */ + unsigned nbytes = 10 + (upper_bound > 1) * 10; + + GET_BUFFER_SPACE(nbytes); + + /* Initialize lower bound of the `succeed_n', even + though it will be set during matching by its + attendant `set_number_at' (inserted next), + because `re_compile_fastmap' needs to know. + Jump to the `jump_n' we might insert below. */ + INSERT_JUMP2(succeed_n, laststart, + b + 5 + (upper_bound > 1) * 5, + lower_bound); + b += 5; + + /* Code to initialize the lower bound. Insert + before the `succeed_n'. The `5' is the last two + bytes of this `set_number_at', plus 3 bytes of + the following `succeed_n'. */ + insert_op2(set_number_at, laststart, 5, + lower_bound, b); + b += 5; + + if (upper_bound > 1) { /* More than one repetition is allowed, so + append a backward jump to the `succeed_n' + that starts this interval. + + When we've reached this during matching, + we'll have matched the interval once, so + jump back only `upper_bound - 1' times. */ + STORE_JUMP2(jump_n, b, laststart + 5, + upper_bound - 1); + b += 5; + + /* The location we want to set is the second + parameter of the `jump_n'; that is `b-2' as + an absolute address. `laststart' will be + the `set_number_at' we're about to insert; + `laststart+3' the number to set, the source + for the relative address. But we are + inserting into the middle of the pattern -- + so everything is getting moved up by 5. + Conclusion: (b - 2) - (laststart + 3) + 5, + i.e., b - laststart. + + We insert this at the beginning of the loop + so that if we fail during matching, we'll + reinitialize the bounds. */ + insert_op2(set_number_at, laststart, + b - laststart, upper_bound - 1, b); + b += 5; + } + } + pending_exact = 0; + beg_interval = NULL; + } + break; + + unfetch_interval: + /* If an invalid interval, match the characters as literals. */ + assert(beg_interval); + p = beg_interval; + beg_interval = NULL; + + /* normal_char and normal_backslash need `c'. */ + PATFETCH(c); + + if (!(syntax & RE_NO_BK_BRACES)) { + if (p > pattern && p[-1] == '\\') + goto normal_backslash; + } + goto normal_char; + +#ifdef emacs + /* There is no way to specify the before_dot and after_dot + operators. rms says this is ok. --karl */ + case '=': + BUF_PUSH(at_dot); + break; + + case 's': + laststart = b; + PATFETCH(c); + BUF_PUSH_2(syntaxspec, syntax_spec_code[c]); + break; + + case 'S': + laststart = b; + PATFETCH(c); + BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]); + break; +#endif /* emacs */ + + + case 'w': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + laststart = b; + BUF_PUSH(wordchar); + break; + + + case 'W': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + laststart = b; + BUF_PUSH(notwordchar); + break; + + + case '<': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(wordbeg); + break; + + case '>': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(wordend); + break; + + case 'b': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(wordbound); + break; + + case 'B': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(notwordbound); + break; + + case '`': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(begbuf); + break; + + case '\'': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(endbuf); + break; + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (syntax & RE_NO_BK_REFS) + goto normal_char; + + c1 = c - '0'; + + if (c1 > regnum) + FREE_STACK_RETURN(REG_ESUBREG); + + /* Can't back reference to a subexpression if inside of it. */ + if (group_in_compile_stack(compile_stack, (regnum_t) c1)) + goto normal_char; + + laststart = b; + BUF_PUSH_2(duplicate, c1); + break; + + + case '+': + case '?': + if (syntax & RE_BK_PLUS_QM) + goto handle_plus; + else + goto normal_backslash; + + default: + normal_backslash: + /* You might think it would be useful for \ to mean + not to translate; but if we don't translate it + it will never match anything. */ + c = TRANSLATE(c); + goto normal_char; + } + break; + + + default: + /* Expects the character in `c'. */ + normal_char: + /* If no exactn currently being built. */ + if (!pending_exact + /* If last exactn not at current position. */ + || pending_exact + *pending_exact + 1 != b + /* We have only one byte following the exactn for the count. */ + || *pending_exact == (1 << BYTEWIDTH) - 1 + /* If followed by a repetition operator. */ + || *p == '*' || *p == '^' || ((syntax & RE_BK_PLUS_QM) + ? *p == '\\' && (p[1] == '+' + || p[1] == + '?') : (*p + == + '+' + || + *p + == + '?')) + || ((syntax & RE_INTERVALS) + && ((syntax & RE_NO_BK_BRACES) + ? *p == '{' : (p[0] == '\\' && p[1] == '{')))) { + /* Start building a new exactn. */ + + laststart = b; + + BUF_PUSH_2(exactn, 0); + pending_exact = b - 1; + } + + BUF_PUSH(c); + (*pending_exact)++; + break; + } /* switch (c) */ + } /* while p != pend */ + + + /* Through the pattern now. */ + + if (fixup_alt_jump) + STORE_JUMP(jump_past_alt, fixup_alt_jump, b); + + if (!COMPILE_STACK_EMPTY) + FREE_STACK_RETURN(REG_EPAREN); + + /* If we don't want backtracking, force success + the first time we reach the end of the compiled pattern. */ + if (syntax & RE_NO_POSIX_BACKTRACKING) + BUF_PUSH(succeed); + + free(compile_stack.stack); + + /* We have succeeded; set the length of the buffer. */ + bufp->used = b - bufp->buffer; + +#ifdef DEBUG + if (debug) { + DEBUG_PRINT1("\nCompiled pattern: \n"); + print_compiled_pattern(bufp); + } +#endif /* DEBUG */ + +#ifndef MATCH_MAY_ALLOCATE + /* Initialize the failure stack to the largest possible stack. This + isn't necessary unless we're trying to avoid calling alloca in + the search and match routines. */ + { + int num_regs = bufp->re_nsub + 1; + + /* Since DOUBLE_FAIL_STACK refuses to double only if the current size + is strictly greater than re_max_failures, the largest possible stack + is 2 * re_max_failures failure points. */ + if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) { + fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); + +# ifdef emacs + if (!fail_stack.stack) + fail_stack.stack + = (fail_stack_elt_t *) xmalloc(fail_stack.size + * + sizeof + (fail_stack_elt_t)); + else + fail_stack.stack = + (fail_stack_elt_t *) xrealloc(fail_stack.stack, + (fail_stack.size * + sizeof + (fail_stack_elt_t))); +# else /* not emacs */ + if (!fail_stack.stack) + fail_stack.stack + = (fail_stack_elt_t *) malloc(fail_stack.size + * + sizeof + (fail_stack_elt_t)); + else + fail_stack.stack = + (fail_stack_elt_t *) realloc(fail_stack.stack, + (fail_stack.size * + sizeof + (fail_stack_elt_t))); +# endif /* not emacs */ + } + + regex_grow_registers(num_regs); + } +#endif /* not MATCH_MAY_ALLOCATE */ + + return REG_NOERROR; +} /* regex_compile */ |