summaryrefslogtreecommitdiff
path: root/libc/misc/regex
diff options
context:
space:
mode:
Diffstat (limited to 'libc/misc/regex')
-rw-r--r--libc/misc/regex/regex.c2257
1 files changed, 1128 insertions, 1129 deletions
diff --git a/libc/misc/regex/regex.c b/libc/misc/regex/regex.c
index d14595dfd..350535fa1 100644
--- a/libc/misc/regex/regex.c
+++ b/libc/misc/regex/regex.c
@@ -1832,1139 +1832,11 @@ int num_regs;
#endif /* not MATCH_MAY_ALLOCATE */
-static boolean group_in_compile_stack _RE_ARGS((compile_stack_type
- compile_stack,
-
- regnum_t regnum));
-
-/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
- Returns one of error codes defined in `regex.h', or zero for success.
-
- Assumes the `allocated' (and perhaps `buffer') and `translate'
- fields are set in BUFP on entry.
-
- If it succeeds, results are put in BUFP (if it returns an error, the
- contents of BUFP are undefined):
- `buffer' is the compiled pattern;
- `syntax' is set to SYNTAX;
- `used' is set to the length of the compiled pattern;
- `fastmap_accurate' is zero;
- `re_nsub' is the number of subexpressions in PATTERN;
- `not_bol' and `not_eol' are zero;
-
- The `fastmap' and `newline_anchor' fields are neither
- examined nor set. */
-
-/* Return, freeing storage we allocated. */
-#define FREE_STACK_RETURN(value) \
- return (free (compile_stack.stack), value)
-
-static reg_errcode_t regex_compile(pattern, size, syntax, bufp)
-const char *pattern;
-size_t size;
-reg_syntax_t syntax;
-struct re_pattern_buffer *bufp;
-{
- /* We fetch characters from PATTERN here. Even though PATTERN is
- `char *' (i.e., signed), we declare these variables as unsigned, so
- they can be reliably used as array indices. */
- register unsigned char c, c1;
-
- /* A random temporary spot in PATTERN. */
- const char *p1;
-
- /* Points to the end of the buffer, where we should append. */
- register unsigned char *b;
-
- /* Keeps track of unclosed groups. */
- compile_stack_type compile_stack;
-
- /* Points to the current (ending) position in the pattern. */
- const char *p = pattern;
- const char *pend = pattern + size;
-
- /* How to translate the characters in the pattern. */
- RE_TRANSLATE_TYPE translate = bufp->translate;
-
- /* Address of the count-byte of the most recently inserted `exactn'
- command. This makes it possible to tell if a new exact-match
- character can be added to that command or if the character requires
- a new `exactn' command. */
- unsigned char *pending_exact = 0;
-
- /* Address of start of the most recently finished expression.
- This tells, e.g., postfix * where to find the start of its
- operand. Reset at the beginning of groups and alternatives. */
- unsigned char *laststart = 0;
-
- /* Address of beginning of regexp, or inside of last group. */
- unsigned char *begalt;
-
- /* Place in the uncompiled pattern (i.e., the {) to
- which to go back if the interval is invalid. */
- const char *beg_interval;
-
- /* Address of the place where a forward jump should go to the end of
- the containing expression. Each alternative of an `or' -- except the
- last -- ends with a forward jump of this sort. */
- unsigned char *fixup_alt_jump = 0;
-
- /* Counts open-groups as they are encountered. Remembered for the
- matching close-group on the compile stack, so the same register
- number is put in the stop_memory as the start_memory. */
- regnum_t regnum = 0;
-
-#ifdef DEBUG
- DEBUG_PRINT1("\nCompiling pattern: ");
- if (debug) {
- unsigned debug_count;
-
- for (debug_count = 0; debug_count < size; debug_count++)
- putchar(pattern[debug_count]);
- putchar('\n');
- }
-#endif /* DEBUG */
-
- /* Initialize the compile stack. */
- compile_stack.stack =
- TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
- if (compile_stack.stack == NULL)
- return REG_ESPACE;
-
- compile_stack.size = INIT_COMPILE_STACK_SIZE;
- compile_stack.avail = 0;
-
- /* Initialize the pattern buffer. */
- bufp->syntax = syntax;
- bufp->fastmap_accurate = 0;
- bufp->not_bol = bufp->not_eol = 0;
-
- /* Set `used' to zero, so that if we return an error, the pattern
- printer (for debugging) will think there's no pattern. We reset it
- at the end. */
- bufp->used = 0;
-
- /* Always count groups, whether or not bufp->no_sub is set. */
- bufp->re_nsub = 0;
-
-#if !defined emacs && !defined SYNTAX_TABLE
- /* Initialize the syntax table. */
- init_syntax_once();
-#endif
-
- if (bufp->allocated == 0) {
- if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc
- enough space. This loses if buffer's address is bogus, but
- that is the user's responsibility. */
- RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
- } else { /* Caller did not allocate a buffer. Do it for them. */
- bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
- }
- if (!bufp->buffer)
- FREE_STACK_RETURN(REG_ESPACE);
-
- bufp->allocated = INIT_BUF_SIZE;
- }
-
- begalt = b = bufp->buffer;
-
- /* Loop through the uncompiled pattern until we're at the end. */
- while (p != pend) {
- PATFETCH(c);
-
- switch (c) {
- case '^':
- {
- if ( /* If at start of pattern, it's an operator. */
- p == pattern + 1
- /* If context independent, it's an operator. */
- || syntax & RE_CONTEXT_INDEP_ANCHORS
- /* Otherwise, depends on what's come before. */
- || at_begline_loc_p(pattern, p, syntax))
- BUF_PUSH(begline);
- else
- goto normal_char;
- }
- break;
-
-
- case '$':
- {
- if ( /* If at end of pattern, it's an operator. */
- p == pend
- /* If context independent, it's an operator. */
- || syntax & RE_CONTEXT_INDEP_ANCHORS
- /* Otherwise, depends on what's next. */
- || at_endline_loc_p(p, pend, syntax))
- BUF_PUSH(endline);
- else
- goto normal_char;
- }
- break;
-
-
- case '+':
- case '?':
- if ((syntax & RE_BK_PLUS_QM)
- || (syntax & RE_LIMITED_OPS))
- goto normal_char;
- handle_plus:
- case '*':
- /* If there is no previous pattern... */
- if (!laststart) {
- if (syntax & RE_CONTEXT_INVALID_OPS)
- FREE_STACK_RETURN(REG_BADRPT);
- else if (!(syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
- }
-
- {
- /* Are we optimizing this jump? */
- boolean keep_string_p = false;
-
- /* 1 means zero (many) matches is allowed. */
- char zero_times_ok = 0, many_times_ok = 0;
-
- /* If there is a sequence of repetition chars, collapse it
- down to just one (the right one). We can't combine
- interval operators with these because of, e.g., `a{2}*',
- which should only match an even number of `a's. */
-
- for (;;) {
- zero_times_ok |= c != '+';
- many_times_ok |= c != '?';
-
- if (p == pend)
- break;
-
- PATFETCH(c);
-
- if (c == '*'
- || (!(syntax & RE_BK_PLUS_QM)
- && (c == '+' || c == '?')));
-
- else if (syntax & RE_BK_PLUS_QM && c == '\\') {
- if (p == pend)
- FREE_STACK_RETURN(REG_EESCAPE);
-
- PATFETCH(c1);
- if (!(c1 == '+' || c1 == '?')) {
- PATUNFETCH;
- PATUNFETCH;
- break;
- }
-
- c = c1;
- } else {
- PATUNFETCH;
- break;
- }
-
- /* If we get here, we found another repeat character. */
- }
-
- /* Star, etc. applied to an empty pattern is equivalent
- to an empty pattern. */
- if (!laststart)
- break;
-
- /* Now we know whether or not zero matches is allowed
- and also whether or not two or more matches is allowed. */
- if (many_times_ok) { /* More than one repetition is allowed, so put in at the
- end a backward relative jump from `b' to before the next
- jump we're going to put in below (which jumps from
- laststart to after this jump).
-
- But if we are at the `*' in the exact sequence `.*\n',
- insert an unconditional jump backwards to the .,
- instead of the beginning of the loop. This way we only
- push a failure point once, instead of every time
- through the loop. */
- assert(p - 1 > pattern);
-
- /* Allocate the space for the jump. */
- GET_BUFFER_SPACE(3);
-
- /* We know we are not at the first character of the pattern,
- because laststart was nonzero. And we've already
- incremented `p', by the way, to be the character after
- the `*'. Do we have to do something analogous here
- for null bytes, because of RE_DOT_NOT_NULL? */
- if (TRANSLATE(*(p - 2)) == TRANSLATE('.')
- && zero_times_ok
- && p < pend && TRANSLATE(*p) == TRANSLATE('\n')
- && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */
- STORE_JUMP(jump, b, laststart);
- keep_string_p = true;
- } else
- /* Anything else. */
- STORE_JUMP(maybe_pop_jump, b, laststart - 3);
-
- /* We've added more stuff to the buffer. */
- b += 3;
- }
-
- /* On failure, jump from laststart to b + 3, which will be the
- end of the buffer after this jump is inserted. */
- GET_BUFFER_SPACE(3);
- INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump
- : on_failure_jump, laststart, b + 3);
- pending_exact = 0;
- b += 3;
-
- if (!zero_times_ok) {
- /* At least one repetition is required, so insert a
- `dummy_failure_jump' before the initial
- `on_failure_jump' instruction of the loop. This
- effects a skip over that instruction the first time
- we hit that loop. */
- GET_BUFFER_SPACE(3);
- INSERT_JUMP(dummy_failure_jump, laststart,
- laststart + 6);
- b += 3;
- }
- }
- break;
-
-
- case '.':
- laststart = b;
- BUF_PUSH(anychar);
- break;
-
-
- case '[':
- {
- boolean had_char_class = false;
-
- if (p == pend)
- FREE_STACK_RETURN(REG_EBRACK);
-
- /* Ensure that we have enough space to push a charset: the
- opcode, the length count, and the bitset; 34 bytes in all. */
- GET_BUFFER_SPACE(34);
-
- laststart = b;
-
- /* We test `*p == '^' twice, instead of using an if
- statement, so we only need one BUF_PUSH. */
- BUF_PUSH(*p == '^' ? charset_not : charset);
- if (*p == '^')
- p++;
-
- /* Remember the first position in the bracket expression. */
- p1 = p;
-
- /* Push the number of bytes in the bitmap. */
- BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
-
- /* Clear the whole map. */
- bzero(b, (1 << BYTEWIDTH) / BYTEWIDTH);
-
- /* charset_not matches newline according to a syntax bit. */
- if ((re_opcode_t) b[-2] == charset_not
- && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) SET_LIST_BIT('\n');
-
- /* Read in characters and ranges, setting map bits. */
- for (;;) {
- if (p == pend)
- FREE_STACK_RETURN(REG_EBRACK);
-
- PATFETCH(c);
-
- /* \ might escape characters inside [...] and [^...]. */
- if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') {
- if (p == pend)
- FREE_STACK_RETURN(REG_EESCAPE);
-
- PATFETCH(c1);
- SET_LIST_BIT(c1);
- continue;
- }
-
- /* Could be the end of the bracket expression. If it's
- not (i.e., when the bracket expression is `[]' so
- far), the ']' character bit gets set way below. */
- if (c == ']' && p != p1 + 1)
- break;
-
- /* Look ahead to see if it's a range when the last thing
- was a character class. */
- if (had_char_class && c == '-' && *p != ']')
- FREE_STACK_RETURN(REG_ERANGE);
-
- /* Look ahead to see if it's a range when the last thing
- was a character: if this is a hyphen not at the
- beginning or the end of a list, then it's the range
- operator. */
- if (c == '-' && !(p - 2 >= pattern && p[-2] == '[')
- && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
- && *p != ']') {
- reg_errcode_t ret
- = compile_range(&p, pend, translate, syntax, b);
-
- if (ret != REG_NOERROR)
- FREE_STACK_RETURN(ret);
- }
-
- else if (p[0] == '-' && p[1] != ']') { /* This handles ranges made up of characters only. */
- reg_errcode_t ret;
-
- /* Move past the `-'. */
- PATFETCH(c1);
-
- ret = compile_range(&p, pend, translate, syntax, b);
- if (ret != REG_NOERROR)
- FREE_STACK_RETURN(ret);
- }
-
- /* See if we're at the beginning of a possible character
- class. */
-
- else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') { /* Leave room for the null. */
- char str[CHAR_CLASS_MAX_LENGTH + 1];
-
- PATFETCH(c);
- c1 = 0;
-
- /* If pattern is `[[:'. */
- if (p == pend)
- FREE_STACK_RETURN(REG_EBRACK);
-
- for (;;) {
- PATFETCH(c);
- if ((c == ':' && *p == ']') || p == pend)
- break;
- if (c1 < CHAR_CLASS_MAX_LENGTH)
- str[c1++] = c;
- else
- /* This is in any case an invalid class name. */
- str[0] = '\0';
- }
- str[c1] = '\0';
-
- /* If isn't a word bracketed by `[:' and `:]':
- undo the ending character, the letters, and leave
- the leading `:' and `[' (but set bits for them). */
- if (c == ':' && *p == ']') {
-#if defined _LIBC || WIDE_CHAR_SUPPORT
- boolean is_lower = STREQ(str, "lower");
- boolean is_upper = STREQ(str, "upper");
- wctype_t wt;
- int ch;
-
- wt = IS_CHAR_CLASS(str);
- if (wt == 0)
- FREE_STACK_RETURN(REG_ECTYPE);
-
- /* Throw away the ] at the end of the character
- class. */
- PATFETCH(c);
-
- if (p == pend)
- FREE_STACK_RETURN(REG_EBRACK);
-
- for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) {
-# ifdef _LIBC
- if (__iswctype(__btowc(ch), wt))
- SET_LIST_BIT(ch);
-# else
- if (iswctype(btowc(ch), wt))
- SET_LIST_BIT(ch);
-# endif
-
- if (translate && (is_upper || is_lower)
- && (ISUPPER(ch) || ISLOWER(ch)))
- SET_LIST_BIT(ch);
- }
-
- had_char_class = true;
-#else
- int ch;
- boolean is_alnum = STREQ(str, "alnum");
- boolean is_alpha = STREQ(str, "alpha");
- boolean is_blank = STREQ(str, "blank");
- boolean is_cntrl = STREQ(str, "cntrl");
- boolean is_digit = STREQ(str, "digit");
- boolean is_graph = STREQ(str, "graph");
- boolean is_lower = STREQ(str, "lower");
- boolean is_print = STREQ(str, "print");
- boolean is_punct = STREQ(str, "punct");
- boolean is_space = STREQ(str, "space");
- boolean is_upper = STREQ(str, "upper");
- boolean is_xdigit = STREQ(str, "xdigit");
-
- if (!IS_CHAR_CLASS(str))
- FREE_STACK_RETURN(REG_ECTYPE);
-
- /* Throw away the ] at the end of the character
- class. */
- PATFETCH(c);
-
- if (p == pend)
- FREE_STACK_RETURN(REG_EBRACK);
-
- for (ch = 0; ch < 1 << BYTEWIDTH; ch++) {
- /* This was split into 3 if's to
- avoid an arbitrary limit in some compiler. */
- if ((is_alnum && ISALNUM(ch))
- || (is_alpha && ISALPHA(ch))
- || (is_blank && ISBLANK(ch))
- || (is_cntrl && ISCNTRL(ch)))
- SET_LIST_BIT(ch);
- if ((is_digit && ISDIGIT(ch))
- || (is_graph && ISGRAPH(ch))
- || (is_lower && ISLOWER(ch))
- || (is_print && ISPRINT(ch)))
- SET_LIST_BIT(ch);
- if ((is_punct && ISPUNCT(ch))
- || (is_space && ISSPACE(ch))
- || (is_upper && ISUPPER(ch))
- || (is_xdigit && ISXDIGIT(ch)))
- SET_LIST_BIT(ch);
- if (translate && (is_upper || is_lower)
- && (ISUPPER(ch) || ISLOWER(ch)))
- SET_LIST_BIT(ch);
- }
- had_char_class = true;
-#endif /* libc || wctype.h */
- } else {
- c1++;
- while (c1--)
- PATUNFETCH;
- SET_LIST_BIT('[');
- SET_LIST_BIT(':');
- had_char_class = false;
- }
- } else {
- had_char_class = false;
- SET_LIST_BIT(c);
- }
- }
-
- /* Discard any (non)matching list bytes that are all 0 at the
- end of the map. Decrease the map-length byte too. */
- while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
- b[-1]--;
- b += b[-1];
- }
- break;
-
-
- case '(':
- if (syntax & RE_NO_BK_PARENS)
- goto handle_open;
- else
- goto normal_char;
-
-
- case ')':
- if (syntax & RE_NO_BK_PARENS)
- goto handle_close;
- else
- goto normal_char;
-
-
- case '\n':
- if (syntax & RE_NEWLINE_ALT)
- goto handle_alt;
- else
- goto normal_char;
-
-
- case '|':
- if (syntax & RE_NO_BK_VBAR)
- goto handle_alt;
- else
- goto normal_char;
-
-
- case '{':
- if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
- goto handle_interval;
- else
- goto normal_char;
-
-
- case '\\':
- if (p == pend)
- FREE_STACK_RETURN(REG_EESCAPE);
-
- /* Do not translate the character after the \, so that we can
- distinguish, e.g., \B from \b, even if we normally would
- translate, e.g., B to b. */
- PATFETCH_RAW(c);
-
- switch (c) {
- case '(':
- if (syntax & RE_NO_BK_PARENS)
- goto normal_backslash;
-
- handle_open:
- bufp->re_nsub++;
- regnum++;
-
- if (COMPILE_STACK_FULL) {
- RETALLOC(compile_stack.stack, compile_stack.size << 1,
- compile_stack_elt_t);
- if (compile_stack.stack == NULL)
- return REG_ESPACE;
-
- compile_stack.size <<= 1;
- }
-
- /* These are the values to restore when we hit end of this
- group. They are all relative offsets, so that if the
- whole pattern moves because of realloc, they will still
- be valid. */
- COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
- COMPILE_STACK_TOP.fixup_alt_jump
- =
- fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
- COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
- COMPILE_STACK_TOP.regnum = regnum;
-
- /* We will eventually replace the 0 with the number of
- groups inner to this one. But do not push a
- start_memory for groups beyond the last one we can
- represent in the compiled pattern. */
- if (regnum <= MAX_REGNUM) {
- COMPILE_STACK_TOP.inner_group_offset =
- b - bufp->buffer + 2;
- BUF_PUSH_3(start_memory, regnum, 0);
- }
-
- compile_stack.avail++;
-
- fixup_alt_jump = 0;
- laststart = 0;
- begalt = b;
- /* If we've reached MAX_REGNUM groups, then this open
- won't actually generate any code, so we'll have to
- clear pending_exact explicitly. */
- pending_exact = 0;
- break;
-
-
- case ')':
- if (syntax & RE_NO_BK_PARENS)
- goto normal_backslash;
-
- if (COMPILE_STACK_EMPTY) {
- if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
- goto normal_backslash;
- else
- FREE_STACK_RETURN(REG_ERPAREN);
- }
-
- handle_close:
- if (fixup_alt_jump) { /* Push a dummy failure point at the end of the
- alternative for a possible future
- `pop_failure_jump' to pop. See comments at
- `push_dummy_failure' in `re_match_2'. */
- BUF_PUSH(push_dummy_failure);
-
- /* We allocated space for this jump when we assigned
- to `fixup_alt_jump', in the `handle_alt' case below. */
- STORE_JUMP(jump_past_alt, fixup_alt_jump, b - 1);
- }
-
- /* See similar code for backslashed left paren above. */
- if (COMPILE_STACK_EMPTY) {
- if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
- goto normal_char;
- else
- FREE_STACK_RETURN(REG_ERPAREN);
- }
-
- /* Since we just checked for an empty stack above, this
- ``can't happen''. */
- assert(compile_stack.avail != 0);
- {
- /* We don't just want to restore into `regnum', because
- later groups should continue to be numbered higher,
- as in `(ab)c(de)' -- the second group is #2. */
- regnum_t this_group_regnum;
-
- compile_stack.avail--;
- begalt =
- bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
- fixup_alt_jump =
- COMPILE_STACK_TOP.fixup_alt_jump ? bufp->buffer +
- COMPILE_STACK_TOP.fixup_alt_jump - 1 : 0;
- laststart =
- bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
- this_group_regnum = COMPILE_STACK_TOP.regnum;
- /* If we've reached MAX_REGNUM groups, then this open
- won't actually generate any code, so we'll have to
- clear pending_exact explicitly. */
- pending_exact = 0;
-
- /* We're at the end of the group, so now we know how many
- groups were inside this one. */
- if (this_group_regnum <= MAX_REGNUM) {
- unsigned char *inner_group_loc
-
- =
- bufp->buffer +
- COMPILE_STACK_TOP.inner_group_offset;
-
- *inner_group_loc = regnum - this_group_regnum;
- BUF_PUSH_3(stop_memory, this_group_regnum,
- regnum - this_group_regnum);
- }
- }
- break;
-
-
- case '|': /* `\|'. */
- if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
- goto normal_backslash;
- handle_alt:
- if (syntax & RE_LIMITED_OPS)
- goto normal_char;
-
- /* Insert before the previous alternative a jump which
- jumps to this alternative if the former fails. */
- GET_BUFFER_SPACE(3);
- INSERT_JUMP(on_failure_jump, begalt, b + 6);
- pending_exact = 0;
- b += 3;
-
- /* The alternative before this one has a jump after it
- which gets executed if it gets matched. Adjust that
- jump so it will jump to this alternative's analogous
- jump (put in below, which in turn will jump to the next
- (if any) alternative's such jump, etc.). The last such
- jump jumps to the correct final destination. A picture:
- _____ _____
- | | | |
- | v | v
- a | b | c
-
- If we are at `b', then fixup_alt_jump right now points to a
- three-byte space after `a'. We'll put in the jump, set
- fixup_alt_jump to right after `b', and leave behind three
- bytes which we'll fill in when we get to after `c'. */
-
- if (fixup_alt_jump)
- STORE_JUMP(jump_past_alt, fixup_alt_jump, b);
-
- /* Mark and leave space for a jump after this alternative,
- to be filled in later either by next alternative or
- when know we're at the end of a series of alternatives. */
- fixup_alt_jump = b;
- GET_BUFFER_SPACE(3);
- b += 3;
-
- laststart = 0;
- begalt = b;
- break;
-
-
- case '{':
- /* If \{ is a literal. */
- if (!(syntax & RE_INTERVALS)
- /* If we're at `\{' and it's not the open-interval
- operator. */
- || ((syntax & RE_INTERVALS)
- && (syntax & RE_NO_BK_BRACES)) || (p - 2 == pattern
- && p == pend))
- goto normal_backslash;
-
- handle_interval:
- {
- /* If got here, then the syntax allows intervals. */
-
- /* At least (most) this many matches must be made. */
- int lower_bound = -1, upper_bound = -1;
-
- beg_interval = p - 1;
-
- if (p == pend) {
- if (!(syntax & RE_INTERVALS)
- && (syntax & RE_NO_BK_BRACES)) goto
- unfetch_interval;
- else
- FREE_STACK_RETURN(REG_EBRACE);
- }
-
- GET_UNSIGNED_NUMBER(lower_bound);
-
- if (c == ',') {
- GET_UNSIGNED_NUMBER(upper_bound);
- if ((!(syntax & RE_NO_BK_BRACES) && c != '\\')
- || ((syntax & RE_NO_BK_BRACES) && c != '}'))
- FREE_STACK_RETURN(REG_BADBR);
-
- if (upper_bound < 0)
- upper_bound = RE_DUP_MAX;
- } else
- /* Interval such as `{1}' => match exactly once. */
- upper_bound = lower_bound;
-
- if (lower_bound < 0 || upper_bound > RE_DUP_MAX
- || lower_bound > upper_bound) {
- if (!(syntax & RE_INTERVALS)
- && (syntax & RE_NO_BK_BRACES)) goto
- unfetch_interval;
- else
- FREE_STACK_RETURN(REG_BADBR);
- }
-
- if (!(syntax & RE_NO_BK_BRACES)) {
- if (c != '\\')
- FREE_STACK_RETURN(REG_EBRACE);
-
- PATFETCH(c);
- }
-
- if (c != '}') {
- if (!(syntax & RE_INTERVALS)
- && (syntax & RE_NO_BK_BRACES)) goto
- unfetch_interval;
- else
- FREE_STACK_RETURN(REG_BADBR);
- }
-
- /* We just parsed a valid interval. */
-
- /* If it's invalid to have no preceding re. */
- if (!laststart) {
- if (syntax & RE_CONTEXT_INVALID_OPS)
- FREE_STACK_RETURN(REG_BADRPT);
- else if (syntax & RE_CONTEXT_INDEP_OPS)
- laststart = b;
- else
- goto unfetch_interval;
- }
-
- /* If the upper bound is zero, don't want to succeed at
- all; jump from `laststart' to `b + 3', which will be
- the end of the buffer after we insert the jump. */
- if (upper_bound == 0) {
- GET_BUFFER_SPACE(3);
- INSERT_JUMP(jump, laststart, b + 3);
- b += 3;
- }
-
- /* Otherwise, we have a nontrivial interval. When
- we're all done, the pattern will look like:
- set_number_at <jump count> <upper bound>
- set_number_at <succeed_n count> <lower bound>
- succeed_n <after jump addr> <succeed_n count>
- <body of loop>
- jump_n <succeed_n addr> <jump count>
- (The upper bound and `jump_n' are omitted if
- `upper_bound' is 1, though.) */
- else { /* If the upper bound is > 1, we need to insert
- more at the end of the loop. */
- unsigned nbytes = 10 + (upper_bound > 1) * 10;
-
- GET_BUFFER_SPACE(nbytes);
-
- /* Initialize lower bound of the `succeed_n', even
- though it will be set during matching by its
- attendant `set_number_at' (inserted next),
- because `re_compile_fastmap' needs to know.
- Jump to the `jump_n' we might insert below. */
- INSERT_JUMP2(succeed_n, laststart,
- b + 5 + (upper_bound > 1) * 5,
- lower_bound);
- b += 5;
-
- /* Code to initialize the lower bound. Insert
- before the `succeed_n'. The `5' is the last two
- bytes of this `set_number_at', plus 3 bytes of
- the following `succeed_n'. */
- insert_op2(set_number_at, laststart, 5,
- lower_bound, b);
- b += 5;
-
- if (upper_bound > 1) { /* More than one repetition is allowed, so
- append a backward jump to the `succeed_n'
- that starts this interval.
-
- When we've reached this during matching,
- we'll have matched the interval once, so
- jump back only `upper_bound - 1' times. */
- STORE_JUMP2(jump_n, b, laststart + 5,
- upper_bound - 1);
- b += 5;
-
- /* The location we want to set is the second
- parameter of the `jump_n'; that is `b-2' as
- an absolute address. `laststart' will be
- the `set_number_at' we're about to insert;
- `laststart+3' the number to set, the source
- for the relative address. But we are
- inserting into the middle of the pattern --
- so everything is getting moved up by 5.
- Conclusion: (b - 2) - (laststart + 3) + 5,
- i.e., b - laststart.
-
- We insert this at the beginning of the loop
- so that if we fail during matching, we'll
- reinitialize the bounds. */
- insert_op2(set_number_at, laststart,
- b - laststart, upper_bound - 1, b);
- b += 5;
- }
- }
- pending_exact = 0;
- beg_interval = NULL;
- }
- break;
-
- unfetch_interval:
- /* If an invalid interval, match the characters as literals. */
- assert(beg_interval);
- p = beg_interval;
- beg_interval = NULL;
-
- /* normal_char and normal_backslash need `c'. */
- PATFETCH(c);
-
- if (!(syntax & RE_NO_BK_BRACES)) {
- if (p > pattern && p[-1] == '\\')
- goto normal_backslash;
- }
- goto normal_char;
-
-#ifdef emacs
- /* There is no way to specify the before_dot and after_dot
- operators. rms says this is ok. --karl */
- case '=':
- BUF_PUSH(at_dot);
- break;
-
- case 's':
- laststart = b;
- PATFETCH(c);
- BUF_PUSH_2(syntaxspec, syntax_spec_code[c]);
- break;
-
- case 'S':
- laststart = b;
- PATFETCH(c);
- BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]);
- break;
-#endif /* emacs */
-
-
- case 'w':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- laststart = b;
- BUF_PUSH(wordchar);
- break;
-
-
- case 'W':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- laststart = b;
- BUF_PUSH(notwordchar);
- break;
-
-
- case '<':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH(wordbeg);
- break;
-
- case '>':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH(wordend);
- break;
-
- case 'b':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH(wordbound);
- break;
-
- case 'B':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH(notwordbound);
- break;
-
- case '`':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH(begbuf);
- break;
-
- case '\'':
- if (syntax & RE_NO_GNU_OPS)
- goto normal_char;
- BUF_PUSH(endbuf);
- break;
-
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- if (syntax & RE_NO_BK_REFS)
- goto normal_char;
-
- c1 = c - '0';
-
- if (c1 > regnum)
- FREE_STACK_RETURN(REG_ESUBREG);
-
- /* Can't back reference to a subexpression if inside of it. */
- if (group_in_compile_stack(compile_stack, (regnum_t) c1))
- goto normal_char;
-
- laststart = b;
- BUF_PUSH_2(duplicate, c1);
- break;
-
-
- case '+':
- case '?':
- if (syntax & RE_BK_PLUS_QM)
- goto handle_plus;
- else
- goto normal_backslash;
-
- default:
- normal_backslash:
- /* You might think it would be useful for \ to mean
- not to translate; but if we don't translate it
- it will never match anything. */
- c = TRANSLATE(c);
- goto normal_char;
- }
- break;
-
-
- default:
- /* Expects the character in `c'. */
- normal_char:
- /* If no exactn currently being built. */
- if (!pending_exact
- /* If last exactn not at current position. */
- || pending_exact + *pending_exact + 1 != b
- /* We have only one byte following the exactn for the count. */
- || *pending_exact == (1 << BYTEWIDTH) - 1
- /* If followed by a repetition operator. */
- || *p == '*' || *p == '^' || ((syntax & RE_BK_PLUS_QM)
- ? *p == '\\' && (p[1] == '+'
- || p[1] ==
- '?') : (*p
- ==
- '+'
- ||
- *p
- ==
- '?'))
- || ((syntax & RE_INTERVALS)
- && ((syntax & RE_NO_BK_BRACES)
- ? *p == '{' : (p[0] == '\\' && p[1] == '{')))) {
- /* Start building a new exactn. */
-
- laststart = b;
-
- BUF_PUSH_2(exactn, 0);
- pending_exact = b - 1;
- }
-
- BUF_PUSH(c);
- (*pending_exact)++;
- break;
- } /* switch (c) */
- } /* while p != pend */
-
-
- /* Through the pattern now. */
-
- if (fixup_alt_jump)
- STORE_JUMP(jump_past_alt, fixup_alt_jump, b);
-
- if (!COMPILE_STACK_EMPTY)
- FREE_STACK_RETURN(REG_EPAREN);
-
- /* If we don't want backtracking, force success
- the first time we reach the end of the compiled pattern. */
- if (syntax & RE_NO_POSIX_BACKTRACKING)
- BUF_PUSH(succeed);
-
- free(compile_stack.stack);
-
- /* We have succeeded; set the length of the buffer. */
- bufp->used = b - bufp->buffer;
-
-#ifdef DEBUG
- if (debug) {
- DEBUG_PRINT1("\nCompiled pattern: \n");
- print_compiled_pattern(bufp);
- }
-#endif /* DEBUG */
-
-#ifndef MATCH_MAY_ALLOCATE
- /* Initialize the failure stack to the largest possible stack. This
- isn't necessary unless we're trying to avoid calling alloca in
- the search and match routines. */
- {
- int num_regs = bufp->re_nsub + 1;
-
- /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
- is strictly greater than re_max_failures, the largest possible stack
- is 2 * re_max_failures failure points. */
- if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) {
- fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
-
-# ifdef emacs
- if (!fail_stack.stack)
- fail_stack.stack
- = (fail_stack_elt_t *) xmalloc(fail_stack.size
- *
- sizeof
- (fail_stack_elt_t));
- else
- fail_stack.stack =
- (fail_stack_elt_t *) xrealloc(fail_stack.stack,
- (fail_stack.size *
- sizeof
- (fail_stack_elt_t)));
-# else /* not emacs */
- if (!fail_stack.stack)
- fail_stack.stack
- = (fail_stack_elt_t *) malloc(fail_stack.size
- *
- sizeof
- (fail_stack_elt_t));
- else
- fail_stack.stack =
- (fail_stack_elt_t *) realloc(fail_stack.stack,
- (fail_stack.size *
- sizeof
- (fail_stack_elt_t)));
-# endif /* not emacs */
- }
-
- regex_grow_registers(num_regs);
- }
-#endif /* not MATCH_MAY_ALLOCATE */
-
- return REG_NOERROR;
-} /* regex_compile */
-
/* Subroutines for `regex_compile'. */
/* Store OP at LOC followed by two-byte integer parameter ARG. */
-static void store_op1(op, loc, arg)
+static inline void store_op1(op, loc, arg)
re_opcode_t op;
unsigned char *loc;
int arg;
@@ -3068,6 +1940,10 @@ reg_syntax_t syntax;
/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
false if it's not. */
+static boolean group_in_compile_stack _RE_ARGS((compile_stack_type
+ compile_stack,
+ regnum_t regnum));
+
static boolean group_in_compile_stack(compile_stack, regnum)
compile_stack_type compile_stack;
regnum_t regnum;
@@ -5727,3 +4603,1126 @@ regex_t *preg;
weak_alias(__regfree, regfree)
#endif
#endif /* not emacs */
+
+/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
+ Returns one of error codes defined in `regex.h', or zero for success.
+
+ Assumes the `allocated' (and perhaps `buffer') and `translate'
+ fields are set in BUFP on entry.
+
+ If it succeeds, results are put in BUFP (if it returns an error, the
+ contents of BUFP are undefined):
+ `buffer' is the compiled pattern;
+ `syntax' is set to SYNTAX;
+ `used' is set to the length of the compiled pattern;
+ `fastmap_accurate' is zero;
+ `re_nsub' is the number of subexpressions in PATTERN;
+ `not_bol' and `not_eol' are zero;
+
+ The `fastmap' and `newline_anchor' fields are neither
+ examined nor set. */
+
+/* Return, freeing storage we allocated. */
+#define FREE_STACK_RETURN(value) \
+ return (free (compile_stack.stack), value)
+
+static reg_errcode_t regex_compile(pattern, size, syntax, bufp)
+const char *pattern;
+size_t size;
+reg_syntax_t syntax;
+struct re_pattern_buffer *bufp;
+{
+ /* We fetch characters from PATTERN here. Even though PATTERN is
+ `char *' (i.e., signed), we declare these variables as unsigned, so
+ they can be reliably used as array indices. */
+ register unsigned char c, c1;
+
+ /* A random temporary spot in PATTERN. */
+ const char *p1;
+
+ /* Points to the end of the buffer, where we should append. */
+ register unsigned char *b;
+
+ /* Keeps track of unclosed groups. */
+ compile_stack_type compile_stack;
+
+ /* Points to the current (ending) position in the pattern. */
+ const char *p = pattern;
+ const char *pend = pattern + size;
+
+ /* How to translate the characters in the pattern. */
+ RE_TRANSLATE_TYPE translate = bufp->translate;
+
+ /* Address of the count-byte of the most recently inserted `exactn'
+ command. This makes it possible to tell if a new exact-match
+ character can be added to that command or if the character requires
+ a new `exactn' command. */
+ unsigned char *pending_exact = 0;
+
+ /* Address of start of the most recently finished expression.
+ This tells, e.g., postfix * where to find the start of its
+ operand. Reset at the beginning of groups and alternatives. */
+ unsigned char *laststart = 0;
+
+ /* Address of beginning of regexp, or inside of last group. */
+ unsigned char *begalt;
+
+ /* Place in the uncompiled pattern (i.e., the {) to
+ which to go back if the interval is invalid. */
+ const char *beg_interval;
+
+ /* Address of the place where a forward jump should go to the end of
+ the containing expression. Each alternative of an `or' -- except the
+ last -- ends with a forward jump of this sort. */
+ unsigned char *fixup_alt_jump = 0;
+
+ /* Counts open-groups as they are encountered. Remembered for the
+ matching close-group on the compile stack, so the same register
+ number is put in the stop_memory as the start_memory. */
+ regnum_t regnum = 0;
+
+#ifdef DEBUG
+ DEBUG_PRINT1("\nCompiling pattern: ");
+ if (debug) {
+ unsigned debug_count;
+
+ for (debug_count = 0; debug_count < size; debug_count++)
+ putchar(pattern[debug_count]);
+ putchar('\n');
+ }
+#endif /* DEBUG */
+
+ /* Initialize the compile stack. */
+ compile_stack.stack =
+ TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
+ if (compile_stack.stack == NULL)
+ return REG_ESPACE;
+
+ compile_stack.size = INIT_COMPILE_STACK_SIZE;
+ compile_stack.avail = 0;
+
+ /* Initialize the pattern buffer. */
+ bufp->syntax = syntax;
+ bufp->fastmap_accurate = 0;
+ bufp->not_bol = bufp->not_eol = 0;
+
+ /* Set `used' to zero, so that if we return an error, the pattern
+ printer (for debugging) will think there's no pattern. We reset it
+ at the end. */
+ bufp->used = 0;
+
+ /* Always count groups, whether or not bufp->no_sub is set. */
+ bufp->re_nsub = 0;
+
+#if !defined emacs && !defined SYNTAX_TABLE
+ /* Initialize the syntax table. */
+ init_syntax_once();
+#endif
+
+ if (bufp->allocated == 0) {
+ if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc
+ enough space. This loses if buffer's address is bogus, but
+ that is the user's responsibility. */
+ RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
+ } else { /* Caller did not allocate a buffer. Do it for them. */
+ bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
+ }
+ if (!bufp->buffer)
+ FREE_STACK_RETURN(REG_ESPACE);
+
+ bufp->allocated = INIT_BUF_SIZE;
+ }
+
+ begalt = b = bufp->buffer;
+
+ /* Loop through the uncompiled pattern until we're at the end. */
+ while (p != pend) {
+ PATFETCH(c);
+
+ switch (c) {
+ case '^':
+ {
+ if ( /* If at start of pattern, it's an operator. */
+ p == pattern + 1
+ /* If context independent, it's an operator. */
+ || syntax & RE_CONTEXT_INDEP_ANCHORS
+ /* Otherwise, depends on what's come before. */
+ || at_begline_loc_p(pattern, p, syntax))
+ BUF_PUSH(begline);
+ else
+ goto normal_char;
+ }
+ break;
+
+
+ case '$':
+ {
+ if ( /* If at end of pattern, it's an operator. */
+ p == pend
+ /* If context independent, it's an operator. */
+ || syntax & RE_CONTEXT_INDEP_ANCHORS
+ /* Otherwise, depends on what's next. */
+ || at_endline_loc_p(p, pend, syntax))
+ BUF_PUSH(endline);
+ else
+ goto normal_char;
+ }
+ break;
+
+
+ case '+':
+ case '?':
+ if ((syntax & RE_BK_PLUS_QM)
+ || (syntax & RE_LIMITED_OPS))
+ goto normal_char;
+ handle_plus:
+ case '*':
+ /* If there is no previous pattern... */
+ if (!laststart) {
+ if (syntax & RE_CONTEXT_INVALID_OPS)
+ FREE_STACK_RETURN(REG_BADRPT);
+ else if (!(syntax & RE_CONTEXT_INDEP_OPS))
+ goto normal_char;
+ }
+
+ {
+ /* Are we optimizing this jump? */
+ boolean keep_string_p = false;
+
+ /* 1 means zero (many) matches is allowed. */
+ char zero_times_ok = 0, many_times_ok = 0;
+
+ /* If there is a sequence of repetition chars, collapse it
+ down to just one (the right one). We can't combine
+ interval operators with these because of, e.g., `a{2}*',
+ which should only match an even number of `a's. */
+
+ for (;;) {
+ zero_times_ok |= c != '+';
+ many_times_ok |= c != '?';
+
+ if (p == pend)
+ break;
+
+ PATFETCH(c);
+
+ if (c == '*'
+ || (!(syntax & RE_BK_PLUS_QM)
+ && (c == '+' || c == '?')));
+
+ else if (syntax & RE_BK_PLUS_QM && c == '\\') {
+ if (p == pend)
+ FREE_STACK_RETURN(REG_EESCAPE);
+
+ PATFETCH(c1);
+ if (!(c1 == '+' || c1 == '?')) {
+ PATUNFETCH;
+ PATUNFETCH;
+ break;
+ }
+
+ c = c1;
+ } else {
+ PATUNFETCH;
+ break;
+ }
+
+ /* If we get here, we found another repeat character. */
+ }
+
+ /* Star, etc. applied to an empty pattern is equivalent
+ to an empty pattern. */
+ if (!laststart)
+ break;
+
+ /* Now we know whether or not zero matches is allowed
+ and also whether or not two or more matches is allowed. */
+ if (many_times_ok) { /* More than one repetition is allowed, so put in at the
+ end a backward relative jump from `b' to before the next
+ jump we're going to put in below (which jumps from
+ laststart to after this jump).
+
+ But if we are at the `*' in the exact sequence `.*\n',
+ insert an unconditional jump backwards to the .,
+ instead of the beginning of the loop. This way we only
+ push a failure point once, instead of every time
+ through the loop. */
+ assert(p - 1 > pattern);
+
+ /* Allocate the space for the jump. */
+ GET_BUFFER_SPACE(3);
+
+ /* We know we are not at the first character of the pattern,
+ because laststart was nonzero. And we've already
+ incremented `p', by the way, to be the character after
+ the `*'. Do we have to do something analogous here
+ for null bytes, because of RE_DOT_NOT_NULL? */
+ if (TRANSLATE(*(p - 2)) == TRANSLATE('.')
+ && zero_times_ok
+ && p < pend && TRANSLATE(*p) == TRANSLATE('\n')
+ && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */
+ STORE_JUMP(jump, b, laststart);
+ keep_string_p = true;
+ } else
+ /* Anything else. */
+ STORE_JUMP(maybe_pop_jump, b, laststart - 3);
+
+ /* We've added more stuff to the buffer. */
+ b += 3;
+ }
+
+ /* On failure, jump from laststart to b + 3, which will be the
+ end of the buffer after this jump is inserted. */
+ GET_BUFFER_SPACE(3);
+ INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump
+ : on_failure_jump, laststart, b + 3);
+ pending_exact = 0;
+ b += 3;
+
+ if (!zero_times_ok) {
+ /* At least one repetition is required, so insert a
+ `dummy_failure_jump' before the initial
+ `on_failure_jump' instruction of the loop. This
+ effects a skip over that instruction the first time
+ we hit that loop. */
+ GET_BUFFER_SPACE(3);
+ INSERT_JUMP(dummy_failure_jump, laststart,
+ laststart + 6);
+ b += 3;
+ }
+ }
+ break;
+
+
+ case '.':
+ laststart = b;
+ BUF_PUSH(anychar);
+ break;
+
+
+ case '[':
+ {
+ boolean had_char_class = false;
+
+ if (p == pend)
+ FREE_STACK_RETURN(REG_EBRACK);
+
+ /* Ensure that we have enough space to push a charset: the
+ opcode, the length count, and the bitset; 34 bytes in all. */
+ GET_BUFFER_SPACE(34);
+
+ laststart = b;
+
+ /* We test `*p == '^' twice, instead of using an if
+ statement, so we only need one BUF_PUSH. */
+ BUF_PUSH(*p == '^' ? charset_not : charset);
+ if (*p == '^')
+ p++;
+
+ /* Remember the first position in the bracket expression. */
+ p1 = p;
+
+ /* Push the number of bytes in the bitmap. */
+ BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
+
+ /* Clear the whole map. */
+ bzero(b, (1 << BYTEWIDTH) / BYTEWIDTH);
+
+ /* charset_not matches newline according to a syntax bit. */
+ if ((re_opcode_t) b[-2] == charset_not
+ && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) SET_LIST_BIT('\n');
+
+ /* Read in characters and ranges, setting map bits. */
+ for (;;) {
+ if (p == pend)
+ FREE_STACK_RETURN(REG_EBRACK);
+
+ PATFETCH(c);
+
+ /* \ might escape characters inside [...] and [^...]. */
+ if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') {
+ if (p == pend)
+ FREE_STACK_RETURN(REG_EESCAPE);
+
+ PATFETCH(c1);
+ SET_LIST_BIT(c1);
+ continue;
+ }
+
+ /* Could be the end of the bracket expression. If it's
+ not (i.e., when the bracket expression is `[]' so
+ far), the ']' character bit gets set way below. */
+ if (c == ']' && p != p1 + 1)
+ break;
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character class. */
+ if (had_char_class && c == '-' && *p != ']')
+ FREE_STACK_RETURN(REG_ERANGE);
+
+ /* Look ahead to see if it's a range when the last thing
+ was a character: if this is a hyphen not at the
+ beginning or the end of a list, then it's the range
+ operator. */
+ if (c == '-' && !(p - 2 >= pattern && p[-2] == '[')
+ && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
+ && *p != ']') {
+ reg_errcode_t ret
+ = compile_range(&p, pend, translate, syntax, b);
+
+ if (ret != REG_NOERROR)
+ FREE_STACK_RETURN(ret);
+ }
+
+ else if (p[0] == '-' && p[1] != ']') { /* This handles ranges made up of characters only. */
+ reg_errcode_t ret;
+
+ /* Move past the `-'. */
+ PATFETCH(c1);
+
+ ret = compile_range(&p, pend, translate, syntax, b);
+ if (ret != REG_NOERROR)
+ FREE_STACK_RETURN(ret);
+ }
+
+ /* See if we're at the beginning of a possible character
+ class. */
+
+ else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') { /* Leave room for the null. */
+ char str[CHAR_CLASS_MAX_LENGTH + 1];
+
+ PATFETCH(c);
+ c1 = 0;
+
+ /* If pattern is `[[:'. */
+ if (p == pend)
+ FREE_STACK_RETURN(REG_EBRACK);
+
+ for (;;) {
+ PATFETCH(c);
+ if ((c == ':' && *p == ']') || p == pend)
+ break;
+ if (c1 < CHAR_CLASS_MAX_LENGTH)
+ str[c1++] = c;
+ else
+ /* This is in any case an invalid class name. */
+ str[0] = '\0';
+ }
+ str[c1] = '\0';
+
+ /* If isn't a word bracketed by `[:' and `:]':
+ undo the ending character, the letters, and leave
+ the leading `:' and `[' (but set bits for them). */
+ if (c == ':' && *p == ']') {
+#if defined _LIBC || WIDE_CHAR_SUPPORT
+ boolean is_lower = STREQ(str, "lower");
+ boolean is_upper = STREQ(str, "upper");
+ wctype_t wt;
+ int ch;
+
+ wt = IS_CHAR_CLASS(str);
+ if (wt == 0)
+ FREE_STACK_RETURN(REG_ECTYPE);
+
+ /* Throw away the ] at the end of the character
+ class. */
+ PATFETCH(c);
+
+ if (p == pend)
+ FREE_STACK_RETURN(REG_EBRACK);
+
+ for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) {
+# ifdef _LIBC
+ if (__iswctype(__btowc(ch), wt))
+ SET_LIST_BIT(ch);
+# else
+ if (iswctype(btowc(ch), wt))
+ SET_LIST_BIT(ch);
+# endif
+
+ if (translate && (is_upper || is_lower)
+ && (ISUPPER(ch) || ISLOWER(ch)))
+ SET_LIST_BIT(ch);
+ }
+
+ had_char_class = true;
+#else
+ int ch;
+ boolean is_alnum = STREQ(str, "alnum");
+ boolean is_alpha = STREQ(str, "alpha");
+ boolean is_blank = STREQ(str, "blank");
+ boolean is_cntrl = STREQ(str, "cntrl");
+ boolean is_digit = STREQ(str, "digit");
+ boolean is_graph = STREQ(str, "graph");
+ boolean is_lower = STREQ(str, "lower");
+ boolean is_print = STREQ(str, "print");
+ boolean is_punct = STREQ(str, "punct");
+ boolean is_space = STREQ(str, "space");
+ boolean is_upper = STREQ(str, "upper");
+ boolean is_xdigit = STREQ(str, "xdigit");
+
+ if (!IS_CHAR_CLASS(str))
+ FREE_STACK_RETURN(REG_ECTYPE);
+
+ /* Throw away the ] at the end of the character
+ class. */
+ PATFETCH(c);
+
+ if (p == pend)
+ FREE_STACK_RETURN(REG_EBRACK);
+
+ for (ch = 0; ch < 1 << BYTEWIDTH; ch++) {
+ /* This was split into 3 if's to
+ avoid an arbitrary limit in some compiler. */
+ if ((is_alnum && ISALNUM(ch))
+ || (is_alpha && ISALPHA(ch))
+ || (is_blank && ISBLANK(ch))
+ || (is_cntrl && ISCNTRL(ch)))
+ SET_LIST_BIT(ch);
+ if ((is_digit && ISDIGIT(ch))
+ || (is_graph && ISGRAPH(ch))
+ || (is_lower && ISLOWER(ch))
+ || (is_print && ISPRINT(ch)))
+ SET_LIST_BIT(ch);
+ if ((is_punct && ISPUNCT(ch))
+ || (is_space && ISSPACE(ch))
+ || (is_upper && ISUPPER(ch))
+ || (is_xdigit && ISXDIGIT(ch)))
+ SET_LIST_BIT(ch);
+ if (translate && (is_upper || is_lower)
+ && (ISUPPER(ch) || ISLOWER(ch)))
+ SET_LIST_BIT(ch);
+ }
+ had_char_class = true;
+#endif /* libc || wctype.h */
+ } else {
+ c1++;
+ while (c1--)
+ PATUNFETCH;
+ SET_LIST_BIT('[');
+ SET_LIST_BIT(':');
+ had_char_class = false;
+ }
+ } else {
+ had_char_class = false;
+ SET_LIST_BIT(c);
+ }
+ }
+
+ /* Discard any (non)matching list bytes that are all 0 at the
+ end of the map. Decrease the map-length byte too. */
+ while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
+ b[-1]--;
+ b += b[-1];
+ }
+ break;
+
+
+ case '(':
+ if (syntax & RE_NO_BK_PARENS)
+ goto handle_open;
+ else
+ goto normal_char;
+
+
+ case ')':
+ if (syntax & RE_NO_BK_PARENS)
+ goto handle_close;
+ else
+ goto normal_char;
+
+
+ case '\n':
+ if (syntax & RE_NEWLINE_ALT)
+ goto handle_alt;
+ else
+ goto normal_char;
+
+
+ case '|':
+ if (syntax & RE_NO_BK_VBAR)
+ goto handle_alt;
+ else
+ goto normal_char;
+
+
+ case '{':
+ if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
+ goto handle_interval;
+ else
+ goto normal_char;
+
+
+ case '\\':
+ if (p == pend)
+ FREE_STACK_RETURN(REG_EESCAPE);
+
+ /* Do not translate the character after the \, so that we can
+ distinguish, e.g., \B from \b, even if we normally would
+ translate, e.g., B to b. */
+ PATFETCH_RAW(c);
+
+ switch (c) {
+ case '(':
+ if (syntax & RE_NO_BK_PARENS)
+ goto normal_backslash;
+
+ handle_open:
+ bufp->re_nsub++;
+ regnum++;
+
+ if (COMPILE_STACK_FULL) {
+ RETALLOC(compile_stack.stack, compile_stack.size << 1,
+ compile_stack_elt_t);
+ if (compile_stack.stack == NULL)
+ return REG_ESPACE;
+
+ compile_stack.size <<= 1;
+ }
+
+ /* These are the values to restore when we hit end of this
+ group. They are all relative offsets, so that if the
+ whole pattern moves because of realloc, they will still
+ be valid. */
+ COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
+ COMPILE_STACK_TOP.fixup_alt_jump
+ =
+ fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
+ COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
+ COMPILE_STACK_TOP.regnum = regnum;
+
+ /* We will eventually replace the 0 with the number of
+ groups inner to this one. But do not push a
+ start_memory for groups beyond the last one we can
+ represent in the compiled pattern. */
+ if (regnum <= MAX_REGNUM) {
+ COMPILE_STACK_TOP.inner_group_offset =
+ b - bufp->buffer + 2;
+ BUF_PUSH_3(start_memory, regnum, 0);
+ }
+
+ compile_stack.avail++;
+
+ fixup_alt_jump = 0;
+ laststart = 0;
+ begalt = b;
+ /* If we've reached MAX_REGNUM groups, then this open
+ won't actually generate any code, so we'll have to
+ clear pending_exact explicitly. */
+ pending_exact = 0;
+ break;
+
+
+ case ')':
+ if (syntax & RE_NO_BK_PARENS)
+ goto normal_backslash;
+
+ if (COMPILE_STACK_EMPTY) {
+ if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
+ goto normal_backslash;
+ else
+ FREE_STACK_RETURN(REG_ERPAREN);
+ }
+
+ handle_close:
+ if (fixup_alt_jump) { /* Push a dummy failure point at the end of the
+ alternative for a possible future
+ `pop_failure_jump' to pop. See comments at
+ `push_dummy_failure' in `re_match_2'. */
+ BUF_PUSH(push_dummy_failure);
+
+ /* We allocated space for this jump when we assigned
+ to `fixup_alt_jump', in the `handle_alt' case below. */
+ STORE_JUMP(jump_past_alt, fixup_alt_jump, b - 1);
+ }
+
+ /* See similar code for backslashed left paren above. */
+ if (COMPILE_STACK_EMPTY) {
+ if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
+ goto normal_char;
+ else
+ FREE_STACK_RETURN(REG_ERPAREN);
+ }
+
+ /* Since we just checked for an empty stack above, this
+ ``can't happen''. */
+ assert(compile_stack.avail != 0);
+ {
+ /* We don't just want to restore into `regnum', because
+ later groups should continue to be numbered higher,
+ as in `(ab)c(de)' -- the second group is #2. */
+ regnum_t this_group_regnum;
+
+ compile_stack.avail--;
+ begalt =
+ bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
+ fixup_alt_jump =
+ COMPILE_STACK_TOP.fixup_alt_jump ? bufp->buffer +
+ COMPILE_STACK_TOP.fixup_alt_jump - 1 : 0;
+ laststart =
+ bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
+ this_group_regnum = COMPILE_STACK_TOP.regnum;
+ /* If we've reached MAX_REGNUM groups, then this open
+ won't actually generate any code, so we'll have to
+ clear pending_exact explicitly. */
+ pending_exact = 0;
+
+ /* We're at the end of the group, so now we know how many
+ groups were inside this one. */
+ if (this_group_regnum <= MAX_REGNUM) {
+ unsigned char *inner_group_loc
+
+ =
+ bufp->buffer +
+ COMPILE_STACK_TOP.inner_group_offset;
+
+ *inner_group_loc = regnum - this_group_regnum;
+ BUF_PUSH_3(stop_memory, this_group_regnum,
+ regnum - this_group_regnum);
+ }
+ }
+ break;
+
+
+ case '|': /* `\|'. */
+ if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
+ goto normal_backslash;
+ handle_alt:
+ if (syntax & RE_LIMITED_OPS)
+ goto normal_char;
+
+ /* Insert before the previous alternative a jump which
+ jumps to this alternative if the former fails. */
+ GET_BUFFER_SPACE(3);
+ INSERT_JUMP(on_failure_jump, begalt, b + 6);
+ pending_exact = 0;
+ b += 3;
+
+ /* The alternative before this one has a jump after it
+ which gets executed if it gets matched. Adjust that
+ jump so it will jump to this alternative's analogous
+ jump (put in below, which in turn will jump to the next
+ (if any) alternative's such jump, etc.). The last such
+ jump jumps to the correct final destination. A picture:
+ _____ _____
+ | | | |
+ | v | v
+ a | b | c
+
+ If we are at `b', then fixup_alt_jump right now points to a
+ three-byte space after `a'. We'll put in the jump, set
+ fixup_alt_jump to right after `b', and leave behind three
+ bytes which we'll fill in when we get to after `c'. */
+
+ if (fixup_alt_jump)
+ STORE_JUMP(jump_past_alt, fixup_alt_jump, b);
+
+ /* Mark and leave space for a jump after this alternative,
+ to be filled in later either by next alternative or
+ when know we're at the end of a series of alternatives. */
+ fixup_alt_jump = b;
+ GET_BUFFER_SPACE(3);
+ b += 3;
+
+ laststart = 0;
+ begalt = b;
+ break;
+
+
+ case '{':
+ /* If \{ is a literal. */
+ if (!(syntax & RE_INTERVALS)
+ /* If we're at `\{' and it's not the open-interval
+ operator. */
+ || ((syntax & RE_INTERVALS)
+ && (syntax & RE_NO_BK_BRACES)) || (p - 2 == pattern
+ && p == pend))
+ goto normal_backslash;
+
+ handle_interval:
+ {
+ /* If got here, then the syntax allows intervals. */
+
+ /* At least (most) this many matches must be made. */
+ int lower_bound = -1, upper_bound = -1;
+
+ beg_interval = p - 1;
+
+ if (p == pend) {
+ if (!(syntax & RE_INTERVALS)
+ && (syntax & RE_NO_BK_BRACES)) goto
+ unfetch_interval;
+ else
+ FREE_STACK_RETURN(REG_EBRACE);
+ }
+
+ GET_UNSIGNED_NUMBER(lower_bound);
+
+ if (c == ',') {
+ GET_UNSIGNED_NUMBER(upper_bound);
+ if ((!(syntax & RE_NO_BK_BRACES) && c != '\\')
+ || ((syntax & RE_NO_BK_BRACES) && c != '}'))
+ FREE_STACK_RETURN(REG_BADBR);
+
+ if (upper_bound < 0)
+ upper_bound = RE_DUP_MAX;
+ } else
+ /* Interval such as `{1}' => match exactly once. */
+ upper_bound = lower_bound;
+
+ if (lower_bound < 0 || upper_bound > RE_DUP_MAX
+ || lower_bound > upper_bound) {
+ if (!(syntax & RE_INTERVALS)
+ && (syntax & RE_NO_BK_BRACES)) goto
+ unfetch_interval;
+ else
+ FREE_STACK_RETURN(REG_BADBR);
+ }
+
+ if (!(syntax & RE_NO_BK_BRACES)) {
+ if (c != '\\')
+ FREE_STACK_RETURN(REG_EBRACE);
+
+ PATFETCH(c);
+ }
+
+ if (c != '}') {
+ if (!(syntax & RE_INTERVALS)
+ && (syntax & RE_NO_BK_BRACES)) goto
+ unfetch_interval;
+ else
+ FREE_STACK_RETURN(REG_BADBR);
+ }
+
+ /* We just parsed a valid interval. */
+
+ /* If it's invalid to have no preceding re. */
+ if (!laststart) {
+ if (syntax & RE_CONTEXT_INVALID_OPS)
+ FREE_STACK_RETURN(REG_BADRPT);
+ else if (syntax & RE_CONTEXT_INDEP_OPS)
+ laststart = b;
+ else
+ goto unfetch_interval;
+ }
+
+ /* If the upper bound is zero, don't want to succeed at
+ all; jump from `laststart' to `b + 3', which will be
+ the end of the buffer after we insert the jump. */
+ if (upper_bound == 0) {
+ GET_BUFFER_SPACE(3);
+ INSERT_JUMP(jump, laststart, b + 3);
+ b += 3;
+ }
+
+ /* Otherwise, we have a nontrivial interval. When
+ we're all done, the pattern will look like:
+ set_number_at <jump count> <upper bound>
+ set_number_at <succeed_n count> <lower bound>
+ succeed_n <after jump addr> <succeed_n count>
+ <body of loop>
+ jump_n <succeed_n addr> <jump count>
+ (The upper bound and `jump_n' are omitted if
+ `upper_bound' is 1, though.) */
+ else { /* If the upper bound is > 1, we need to insert
+ more at the end of the loop. */
+ unsigned nbytes = 10 + (upper_bound > 1) * 10;
+
+ GET_BUFFER_SPACE(nbytes);
+
+ /* Initialize lower bound of the `succeed_n', even
+ though it will be set during matching by its
+ attendant `set_number_at' (inserted next),
+ because `re_compile_fastmap' needs to know.
+ Jump to the `jump_n' we might insert below. */
+ INSERT_JUMP2(succeed_n, laststart,
+ b + 5 + (upper_bound > 1) * 5,
+ lower_bound);
+ b += 5;
+
+ /* Code to initialize the lower bound. Insert
+ before the `succeed_n'. The `5' is the last two
+ bytes of this `set_number_at', plus 3 bytes of
+ the following `succeed_n'. */
+ insert_op2(set_number_at, laststart, 5,
+ lower_bound, b);
+ b += 5;
+
+ if (upper_bound > 1) { /* More than one repetition is allowed, so
+ append a backward jump to the `succeed_n'
+ that starts this interval.
+
+ When we've reached this during matching,
+ we'll have matched the interval once, so
+ jump back only `upper_bound - 1' times. */
+ STORE_JUMP2(jump_n, b, laststart + 5,
+ upper_bound - 1);
+ b += 5;
+
+ /* The location we want to set is the second
+ parameter of the `jump_n'; that is `b-2' as
+ an absolute address. `laststart' will be
+ the `set_number_at' we're about to insert;
+ `laststart+3' the number to set, the source
+ for the relative address. But we are
+ inserting into the middle of the pattern --
+ so everything is getting moved up by 5.
+ Conclusion: (b - 2) - (laststart + 3) + 5,
+ i.e., b - laststart.
+
+ We insert this at the beginning of the loop
+ so that if we fail during matching, we'll
+ reinitialize the bounds. */
+ insert_op2(set_number_at, laststart,
+ b - laststart, upper_bound - 1, b);
+ b += 5;
+ }
+ }
+ pending_exact = 0;
+ beg_interval = NULL;
+ }
+ break;
+
+ unfetch_interval:
+ /* If an invalid interval, match the characters as literals. */
+ assert(beg_interval);
+ p = beg_interval;
+ beg_interval = NULL;
+
+ /* normal_char and normal_backslash need `c'. */
+ PATFETCH(c);
+
+ if (!(syntax & RE_NO_BK_BRACES)) {
+ if (p > pattern && p[-1] == '\\')
+ goto normal_backslash;
+ }
+ goto normal_char;
+
+#ifdef emacs
+ /* There is no way to specify the before_dot and after_dot
+ operators. rms says this is ok. --karl */
+ case '=':
+ BUF_PUSH(at_dot);
+ break;
+
+ case 's':
+ laststart = b;
+ PATFETCH(c);
+ BUF_PUSH_2(syntaxspec, syntax_spec_code[c]);
+ break;
+
+ case 'S':
+ laststart = b;
+ PATFETCH(c);
+ BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]);
+ break;
+#endif /* emacs */
+
+
+ case 'w':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ laststart = b;
+ BUF_PUSH(wordchar);
+ break;
+
+
+ case 'W':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ laststart = b;
+ BUF_PUSH(notwordchar);
+ break;
+
+
+ case '<':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ BUF_PUSH(wordbeg);
+ break;
+
+ case '>':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ BUF_PUSH(wordend);
+ break;
+
+ case 'b':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ BUF_PUSH(wordbound);
+ break;
+
+ case 'B':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ BUF_PUSH(notwordbound);
+ break;
+
+ case '`':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ BUF_PUSH(begbuf);
+ break;
+
+ case '\'':
+ if (syntax & RE_NO_GNU_OPS)
+ goto normal_char;
+ BUF_PUSH(endbuf);
+ break;
+
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ if (syntax & RE_NO_BK_REFS)
+ goto normal_char;
+
+ c1 = c - '0';
+
+ if (c1 > regnum)
+ FREE_STACK_RETURN(REG_ESUBREG);
+
+ /* Can't back reference to a subexpression if inside of it. */
+ if (group_in_compile_stack(compile_stack, (regnum_t) c1))
+ goto normal_char;
+
+ laststart = b;
+ BUF_PUSH_2(duplicate, c1);
+ break;
+
+
+ case '+':
+ case '?':
+ if (syntax & RE_BK_PLUS_QM)
+ goto handle_plus;
+ else
+ goto normal_backslash;
+
+ default:
+ normal_backslash:
+ /* You might think it would be useful for \ to mean
+ not to translate; but if we don't translate it
+ it will never match anything. */
+ c = TRANSLATE(c);
+ goto normal_char;
+ }
+ break;
+
+
+ default:
+ /* Expects the character in `c'. */
+ normal_char:
+ /* If no exactn currently being built. */
+ if (!pending_exact
+ /* If last exactn not at current position. */
+ || pending_exact + *pending_exact + 1 != b
+ /* We have only one byte following the exactn for the count. */
+ || *pending_exact == (1 << BYTEWIDTH) - 1
+ /* If followed by a repetition operator. */
+ || *p == '*' || *p == '^' || ((syntax & RE_BK_PLUS_QM)
+ ? *p == '\\' && (p[1] == '+'
+ || p[1] ==
+ '?') : (*p
+ ==
+ '+'
+ ||
+ *p
+ ==
+ '?'))
+ || ((syntax & RE_INTERVALS)
+ && ((syntax & RE_NO_BK_BRACES)
+ ? *p == '{' : (p[0] == '\\' && p[1] == '{')))) {
+ /* Start building a new exactn. */
+
+ laststart = b;
+
+ BUF_PUSH_2(exactn, 0);
+ pending_exact = b - 1;
+ }
+
+ BUF_PUSH(c);
+ (*pending_exact)++;
+ break;
+ } /* switch (c) */
+ } /* while p != pend */
+
+
+ /* Through the pattern now. */
+
+ if (fixup_alt_jump)
+ STORE_JUMP(jump_past_alt, fixup_alt_jump, b);
+
+ if (!COMPILE_STACK_EMPTY)
+ FREE_STACK_RETURN(REG_EPAREN);
+
+ /* If we don't want backtracking, force success
+ the first time we reach the end of the compiled pattern. */
+ if (syntax & RE_NO_POSIX_BACKTRACKING)
+ BUF_PUSH(succeed);
+
+ free(compile_stack.stack);
+
+ /* We have succeeded; set the length of the buffer. */
+ bufp->used = b - bufp->buffer;
+
+#ifdef DEBUG
+ if (debug) {
+ DEBUG_PRINT1("\nCompiled pattern: \n");
+ print_compiled_pattern(bufp);
+ }
+#endif /* DEBUG */
+
+#ifndef MATCH_MAY_ALLOCATE
+ /* Initialize the failure stack to the largest possible stack. This
+ isn't necessary unless we're trying to avoid calling alloca in
+ the search and match routines. */
+ {
+ int num_regs = bufp->re_nsub + 1;
+
+ /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
+ is strictly greater than re_max_failures, the largest possible stack
+ is 2 * re_max_failures failure points. */
+ if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) {
+ fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
+
+# ifdef emacs
+ if (!fail_stack.stack)
+ fail_stack.stack
+ = (fail_stack_elt_t *) xmalloc(fail_stack.size
+ *
+ sizeof
+ (fail_stack_elt_t));
+ else
+ fail_stack.stack =
+ (fail_stack_elt_t *) xrealloc(fail_stack.stack,
+ (fail_stack.size *
+ sizeof
+ (fail_stack_elt_t)));
+# else /* not emacs */
+ if (!fail_stack.stack)
+ fail_stack.stack
+ = (fail_stack_elt_t *) malloc(fail_stack.size
+ *
+ sizeof
+ (fail_stack_elt_t));
+ else
+ fail_stack.stack =
+ (fail_stack_elt_t *) realloc(fail_stack.stack,
+ (fail_stack.size *
+ sizeof
+ (fail_stack_elt_t)));
+# endif /* not emacs */
+ }
+
+ regex_grow_registers(num_regs);
+ }
+#endif /* not MATCH_MAY_ALLOCATE */
+
+ return REG_NOERROR;
+} /* regex_compile */