From 82d766043c6a8dcf6283788419f110dd7ab52f80 Mon Sep 17 00:00:00 2001 From: Eric Andersen <andersen@codepoet.org> Date: Fri, 20 Oct 2000 03:48:11 +0000 Subject: A smaller, kinder, gentler regexp implementation. --- libc/misc/regex/Makefile | 2 +- libc/misc/regex/regex.c | 5725 ++++++++++++++++++++++++++++++++++++ libc/misc/regex/rx.c | 7273 ---------------------------------------------- 3 files changed, 5726 insertions(+), 7274 deletions(-) create mode 100644 libc/misc/regex/regex.c delete mode 100644 libc/misc/regex/rx.c (limited to 'libc') diff --git a/libc/misc/regex/Makefile b/libc/misc/regex/Makefile index c4c13f6cf..38b7e98bf 100644 --- a/libc/misc/regex/Makefile +++ b/libc/misc/regex/Makefile @@ -24,7 +24,7 @@ TOPDIR=../../ include $(TOPDIR)Rules.mak LIBC=$(TOPDIR)libc.a -CSRC=rx.c +CSRC=regex.c COBJS=$(patsubst %.c,%.o, $(CSRC)) OBJS=$(COBJS) diff --git a/libc/misc/regex/regex.c b/libc/misc/regex/regex.c new file mode 100644 index 000000000..64e754ee0 --- /dev/null +++ b/libc/misc/regex/regex.c @@ -0,0 +1,5725 @@ +/* Extended regular expression matching and search library, + version 0.12. + (Implements POSIX draft P1003.2/D11.2, except for some of the + internationalization features.) + Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* AIX requires this to be the first thing in the file. */ +#if defined _AIX && !defined REGEX_MALLOC +#pragma alloca +#endif + +#undef _GNU_SOURCE +#define _GNU_SOURCE +#define STDC_HEADERS + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#ifndef PARAMS +# if defined __GNUC__ || (defined __STDC__ && __STDC__) +# define PARAMS(args) args +# else +# define PARAMS(args) () +# endif /* GCC. */ +#endif /* Not PARAMS. */ + +#if defined STDC_HEADERS && !defined emacs +# include <stddef.h> +#else +/* We need this for `regex.h', and perhaps for the Emacs include files. */ +# include <sys/types.h> +#endif + +#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) + +/* For platform which support the ISO C amendement 1 functionality we + support user defined character classes. */ +#if defined _LIBC || WIDE_CHAR_SUPPORT +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ +# include <wchar.h> +# include <wctype.h> +#endif + +#ifdef _LIBC +/* We have to keep the namespace clean. */ +# define regfree(preg) __regfree (preg) +# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) +# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) +# define regerror(errcode, preg, errbuf, errbuf_size) \ + __regerror(errcode, preg, errbuf, errbuf_size) +# define re_set_registers(bu, re, nu, st, en) \ + __re_set_registers (bu, re, nu, st, en) +# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ + __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) +# define re_match(bufp, string, size, pos, regs) \ + __re_match (bufp, string, size, pos, regs) +# define re_search(bufp, string, size, startpos, range, regs) \ + __re_search (bufp, string, size, startpos, range, regs) +# define re_compile_pattern(pattern, length, bufp) \ + __re_compile_pattern (pattern, length, bufp) +# define re_set_syntax(syntax) __re_set_syntax (syntax) +# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ + __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) +# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) + +#define btowc __btowc +#endif + +/* This is for other GNU distributions with internationalized messages. */ +#if HAVE_LIBINTL_H || defined _LIBC +# include <libintl.h> +#else +# define gettext(msgid) (msgid) +#endif + +#ifndef gettext_noop +/* This define is so xgettext can find the internationalizable + strings. */ +# define gettext_noop(String) String +#endif + +/* The `emacs' switch turns on certain matching commands + that make sense only in Emacs. */ +#ifdef emacs + +# include "lisp.h" +# include "buffer.h" +# include "syntax.h" + +#else /* not emacs */ + +/* If we are not linking with Emacs proper, + we can't use the relocating allocator + even if config.h says that we can. */ +# undef REL_ALLOC + +# if defined STDC_HEADERS || defined _LIBC +# include <stdlib.h> +# else +char *malloc(); +char *realloc(); +# endif + +/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. + If nothing else has been done, use the method below. */ +# ifdef INHIBIT_STRING_HEADER +# if !(defined HAVE_BZERO && defined HAVE_BCOPY) +# if !defined bzero && !defined bcopy +# undef INHIBIT_STRING_HEADER +# endif +# endif +# endif + +/* This is the normal way of making sure we have a bcopy and a bzero. + This is used in most programs--a few other programs avoid this + by defining INHIBIT_STRING_HEADER. */ +# ifndef INHIBIT_STRING_HEADER +# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC +# include <string.h> +# ifndef bzero +# ifndef _LIBC +# define bzero(s, n) (memset (s, '\0', n), (s)) +# else +# define bzero(s, n) __bzero (s, n) +# endif +# endif +# else +# include <strings.h> +# ifndef memcmp +# define memcmp(s1, s2, n) bcmp (s1, s2, n) +# endif +# ifndef memcpy +# define memcpy(d, s, n) (bcopy (s, d, n), (d)) +# endif +# endif +# endif + +/* Define the syntax stuff for \<, \>, etc. */ + +/* This must be nonzero for the wordchar and notwordchar pattern + commands in re_match_2. */ +# ifndef Sword +# define Sword 1 +# endif + +# ifdef SWITCH_ENUM_BUG +# define SWITCH_ENUM_CAST(x) ((int)(x)) +# else +# define SWITCH_ENUM_CAST(x) (x) +# endif + +#endif /* not emacs */ + +/* Get the interface, including the syntax bits. */ +#include <regex.h> + +/* isalpha etc. are used for the character classes. */ +#include <ctype.h> + +/* Jim Meyering writes: + + "... Some ctype macros are valid only for character codes that + isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when + using /bin/cc or gcc but without giving an ansi option). So, all + ctype uses should be through macros like ISPRINT... If + STDC_HEADERS is defined, then autoconf has verified that the ctype + macros don't need to be guarded with references to isascii. ... + Defining isascii to 1 should let any compiler worth its salt + eliminate the && through constant folding." + Solaris defines some of these symbols so we must undefine them first. */ + +#undef ISASCII +#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) +# define ISASCII(c) 1 +#else +# define ISASCII(c) isascii(c) +#endif + +#ifdef isblank +# define ISBLANK(c) (ISASCII (c) && isblank (c)) +#else +# define ISBLANK(c) ((c) == ' ' || (c) == '\t') +#endif +#ifdef isgraph +# define ISGRAPH(c) (ISASCII (c) && isgraph (c)) +#else +# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) +#endif + +#undef ISPRINT +#define ISPRINT(c) (ISASCII (c) && isprint (c)) +#define ISDIGIT(c) (ISASCII (c) && isdigit (c)) +#define ISALNUM(c) (ISASCII (c) && isalnum (c)) +#define ISALPHA(c) (ISASCII (c) && isalpha (c)) +#define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) +#define ISLOWER(c) (ISASCII (c) && islower (c)) +#define ISPUNCT(c) (ISASCII (c) && ispunct (c)) +#define ISSPACE(c) (ISASCII (c) && isspace (c)) +#define ISUPPER(c) (ISASCII (c) && isupper (c)) +#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) + +#ifdef _tolower +# define TOLOWER(c) _tolower(c) +#else +# define TOLOWER(c) tolower(c) +#endif + +#ifndef NULL +# define NULL (void *)0 +#endif + +/* We remove any previous definition of `SIGN_EXTEND_CHAR', + since ours (we hope) works properly with all combinations of + machines, compilers, `char' and `unsigned char' argument types. + (Per Bothner suggested the basic approach.) */ +#undef SIGN_EXTEND_CHAR +#if __STDC__ +# define SIGN_EXTEND_CHAR(c) ((signed char) (c)) +#else /* not __STDC__ */ +/* As in Harbison and Steele. */ +# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) +#endif + +#ifndef emacs +/* How many characters in the character set. */ +# define CHAR_SET_SIZE 256 + +# ifdef SYNTAX_TABLE + +extern char *re_syntax_table; + +# else /* not SYNTAX_TABLE */ + +static char re_syntax_table[CHAR_SET_SIZE]; + +static void init_syntax_once() +{ + register int c; + static int done = 0; + + if (done) + return; + bzero(re_syntax_table, sizeof re_syntax_table); + + for (c = 0; c < CHAR_SET_SIZE; ++c) + if (ISALNUM(c)) + re_syntax_table[c] = Sword; + + re_syntax_table['_'] = Sword; + + done = 1; +} + +# endif /* not SYNTAX_TABLE */ + +# define SYNTAX(c) re_syntax_table[((c) & 0xFF)] + +#endif /* emacs */ + +/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we + use `alloca' instead of `malloc'. This is because using malloc in + re_search* or re_match* could cause memory leaks when C-g is used in + Emacs; also, malloc is slower and causes storage fragmentation. On + the other hand, malloc is more portable, and easier to debug. + + Because we sometimes use alloca, some routines have to be macros, + not functions -- `alloca'-allocated space disappears at the end of the + function it is called in. */ + +#ifdef REGEX_MALLOC + +# define REGEX_ALLOCATE malloc +# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) +# define REGEX_FREE free + +#else /* not REGEX_MALLOC */ + +/* Emacs already defines alloca, sometimes. */ +# ifndef alloca + +/* Make alloca work the best possible way. */ +# ifdef __GNUC__ +# define alloca __builtin_alloca +# else /* not __GNUC__ */ +# if HAVE_ALLOCA_H +# include <alloca.h> +# endif /* HAVE_ALLOCA_H */ +# endif /* not __GNUC__ */ + +# endif /* not alloca */ + +# define REGEX_ALLOCATE alloca + +/* Assumes a `char *destination' variable. */ +# define REGEX_REALLOCATE(source, osize, nsize) \ + (destination = (char *) alloca (nsize), \ + memcpy (destination, source, osize)) + +/* No need to do anything to free, after alloca. */ +# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ + +#endif /* not REGEX_MALLOC */ + +/* Define how to allocate the failure stack. */ + +#if defined REL_ALLOC && defined REGEX_MALLOC + +# define REGEX_ALLOCATE_STACK(size) \ + r_alloc (&failure_stack_ptr, (size)) +# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ + r_re_alloc (&failure_stack_ptr, (nsize)) +# define REGEX_FREE_STACK(ptr) \ + r_alloc_free (&failure_stack_ptr) + +#else /* not using relocating allocator */ + +# ifdef REGEX_MALLOC + +# define REGEX_ALLOCATE_STACK malloc +# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) +# define REGEX_FREE_STACK free + +# else /* not REGEX_MALLOC */ + +# define REGEX_ALLOCATE_STACK alloca + +# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ + REGEX_REALLOCATE (source, osize, nsize) +/* No need to explicitly free anything. */ +# define REGEX_FREE_STACK(arg) + +# endif /* not REGEX_MALLOC */ +#endif /* not using relocating allocator */ + + +/* True if `size1' is non-NULL and PTR is pointing anywhere inside + `string1' or just past its end. This works if PTR is NULL, which is + a good thing. */ +#define FIRST_STRING_P(ptr) \ + (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) + +/* (Re)Allocate N items of type T using malloc, or fail. */ +#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) +#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) +#define RETALLOC_IF(addr, n, t) \ + if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) +#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) + +#define BYTEWIDTH 8 /* In bits. */ + +#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) + +#undef MAX +#undef MIN +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +typedef char boolean; + +#define false 0 +#define true 1 + +static int re_match_2_internal PARAMS((struct re_pattern_buffer * bufp, + const char *string1, int size1, + const char *string2, int size2, + int pos, + struct re_registers * regs, + + int stop)); + +/* These are the command codes that appear in compiled regular + expressions. Some opcodes are followed by argument bytes. A + command code can specify any interpretation whatsoever for its + arguments. Zero bytes may appear in the compiled regular expression. */ + +typedef enum { + no_op = 0, + + /* Succeed right away--no more backtracking. */ + succeed, + + /* Followed by one byte giving n, then by n literal bytes. */ + exactn, + + /* Matches any (more or less) character. */ + anychar, + + /* Matches any one char belonging to specified set. First + following byte is number of bitmap bytes. Then come bytes + for a bitmap saying which chars are in. Bits in each byte + are ordered low-bit-first. A character is in the set if its + bit is 1. A character too large to have a bit in the map is + automatically not in the set. */ + charset, + + /* Same parameters as charset, but match any character that is + not one of those specified. */ + charset_not, + + /* Start remembering the text that is matched, for storing in a + register. Followed by one byte with the register number, in + the range 0 to one less than the pattern buffer's re_nsub + field. Then followed by one byte with the number of groups + inner to this one. (This last has to be part of the + start_memory only because we need it in the on_failure_jump + of re_match_2.) */ + start_memory, + + /* Stop remembering the text that is matched and store it in a + memory register. Followed by one byte with the register + number, in the range 0 to one less than `re_nsub' in the + pattern buffer, and one byte with the number of inner groups, + just like `start_memory'. (We need the number of inner + groups here because we don't have any easy way of finding the + corresponding start_memory when we're at a stop_memory.) */ + stop_memory, + + /* Match a duplicate of something remembered. Followed by one + byte containing the register number. */ + duplicate, + + /* Fail unless at beginning of line. */ + begline, + + /* Fail unless at end of line. */ + endline, + + /* Succeeds if at beginning of buffer (if emacs) or at beginning + of string to be matched (if not). */ + begbuf, + + /* Analogously, for end of buffer/string. */ + endbuf, + + /* Followed by two byte relative address to which to jump. */ + jump, + + /* Same as jump, but marks the end of an alternative. */ + jump_past_alt, + + /* Followed by two-byte relative address of place to resume at + in case of failure. */ + on_failure_jump, + + /* Like on_failure_jump, but pushes a placeholder instead of the + current string position when executed. */ + on_failure_keep_string_jump, + + /* Throw away latest failure point and then jump to following + two-byte relative address. */ + pop_failure_jump, + + /* Change to pop_failure_jump if know won't have to backtrack to + match; otherwise change to jump. This is used to jump + back to the beginning of a repeat. If what follows this jump + clearly won't match what the repeat does, such that we can be + sure that there is no use backtracking out of repetitions + already matched, then we change it to a pop_failure_jump. + Followed by two-byte address. */ + maybe_pop_jump, + + /* Jump to following two-byte address, and push a dummy failure + point. This failure point will be thrown away if an attempt + is made to use it for a failure. A `+' construct makes this + before the first repeat. Also used as an intermediary kind + of jump when compiling an alternative. */ + dummy_failure_jump, + + /* Push a dummy failure point and continue. Used at the end of + alternatives. */ + push_dummy_failure, + + /* Followed by two-byte relative address and two-byte number n. + After matching N times, jump to the address upon failure. */ + succeed_n, + + /* Followed by two-byte relative address, and two-byte number n. + Jump to the address N times, then fail. */ + jump_n, + + /* Set the following two-byte relative address to the + subsequent two-byte number. The address *includes* the two + bytes of number. */ + set_number_at, + + wordchar, /* Matches any word-constituent character. */ + notwordchar, /* Matches any char that is not a word-constituent. */ + + wordbeg, /* Succeeds if at word beginning. */ + wordend, /* Succeeds if at word end. */ + + wordbound, /* Succeeds if at a word boundary. */ + notwordbound /* Succeeds if not at a word boundary. */ +#ifdef emacs + , before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ + + /* Matches any character whose syntax is specified. Followed by + a byte which contains a syntax code, e.g., Sword. */ + syntaxspec, + + /* Matches any character whose syntax is not that specified. */ + notsyntaxspec +#endif /* emacs */ +} re_opcode_t; + +/* Common operations on the compiled pattern. */ + +/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ + +#define STORE_NUMBER(destination, number) \ + do { \ + (destination)[0] = (number) & 0377; \ + (destination)[1] = (number) >> 8; \ + } while (0) + +/* Same as STORE_NUMBER, except increment DESTINATION to + the byte after where the number is stored. Therefore, DESTINATION + must be an lvalue. */ + +#define STORE_NUMBER_AND_INCR(destination, number) \ + do { \ + STORE_NUMBER (destination, number); \ + (destination) += 2; \ + } while (0) + +/* Put into DESTINATION a number stored in two contiguous bytes starting + at SOURCE. */ + +#define EXTRACT_NUMBER(destination, source) \ + do { \ + (destination) = *(source) & 0377; \ + (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ + } while (0) + +#ifdef DEBUG +static void extract_number _RE_ARGS((int *dest, unsigned char *source)); +static void extract_number(dest, source) +int *dest; +unsigned char *source; +{ + int temp = SIGN_EXTEND_CHAR(*(source + 1)); + + *dest = *source & 0377; + *dest += temp << 8; +} + +# ifndef EXTRACT_MACROS /* To debug the macros. */ +# undef EXTRACT_NUMBER +# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) +# endif /* not EXTRACT_MACROS */ + +#endif /* DEBUG */ + +/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. + SOURCE must be an lvalue. */ + +#define EXTRACT_NUMBER_AND_INCR(destination, source) \ + do { \ + EXTRACT_NUMBER (destination, source); \ + (source) += 2; \ + } while (0) + +#ifdef DEBUG +static void extract_number_and_incr _RE_ARGS((int *destination, + unsigned char **source)); +static void extract_number_and_incr(destination, source) +int *destination; +unsigned char **source; +{ + extract_number(destination, *source); + *source += 2; +} + +# ifndef EXTRACT_MACROS +# undef EXTRACT_NUMBER_AND_INCR +# define EXTRACT_NUMBER_AND_INCR(dest, src) \ + extract_number_and_incr (&dest, &src) +# endif /* not EXTRACT_MACROS */ + +#endif /* DEBUG */ + +/* If DEBUG is defined, Regex prints many voluminous messages about what + it is doing (if the variable `debug' is nonzero). If linked with the + main program in `iregex.c', you can enter patterns and strings + interactively. And if linked with the main program in `main.c' and + the other test files, you can run the already-written tests. */ + +#ifdef DEBUG + +/* We use standard I/O for debugging. */ +# include <stdio.h> + +/* It is useful to test things that ``must'' be true when debugging. */ +# include <assert.h> + +static int debug; + +# define DEBUG_STATEMENT(e) e +# define DEBUG_PRINT1(x) if (debug) printf (x) +# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) +# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) +# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) +# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (debug) print_partial_compiled_pattern (s, e) +# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (debug) print_double_string (w, s1, sz1, s2, sz2) + + +/* Print the fastmap in human-readable form. */ + +void print_fastmap(fastmap) +char *fastmap; +{ + unsigned was_a_range = 0; + unsigned i = 0; + + while (i < (1 << BYTEWIDTH)) { + if (fastmap[i++]) { + was_a_range = 0; + putchar(i - 1); + while (i < (1 << BYTEWIDTH) && fastmap[i]) { + was_a_range = 1; + i++; + } + if (was_a_range) { + printf("-"); + putchar(i - 1); + } + } + } + putchar('\n'); +} + + +/* Print a compiled pattern string in human-readable form, starting at + the START pointer into it and ending just before the pointer END. */ + +void print_partial_compiled_pattern(start, end) +unsigned char *start; +unsigned char *end; +{ + int mcnt, mcnt2; + unsigned char *p1; + unsigned char *p = start; + unsigned char *pend = end; + + if (start == NULL) { + printf("(null)\n"); + return; + } + + /* Loop over pattern commands. */ + while (p < pend) { + printf("%d:\t", p - start); + + switch ((re_opcode_t) * p++) { + case no_op: + printf("/no_op"); + break; + + case exactn: + mcnt = *p++; + printf("/exactn/%d", mcnt); + do { + putchar('/'); + putchar(*p++); + } + while (--mcnt); + break; + + case start_memory: + mcnt = *p++; + printf("/start_memory/%d/%d", mcnt, *p++); + break; + + case stop_memory: + mcnt = *p++; + printf("/stop_memory/%d/%d", mcnt, *p++); + break; + + case duplicate: + printf("/duplicate/%d", *p++); + break; + + case anychar: + printf("/anychar"); + break; + + case charset: + case charset_not: + { + register int c, last = -100; + register int in_range = 0; + + printf("/charset [%s", + (re_opcode_t) * (p - 1) == charset_not ? "^" : ""); + + assert(p + *p < pend); + + for (c = 0; c < 256; c++) + if (c / 8 < *p && (p[1 + (c / 8)] & (1 << (c % 8)))) { + /* Are we starting a range? */ + if (last + 1 == c && !in_range) { + putchar('-'); + in_range = 1; + } + /* Have we broken a range? */ + else if (last + 1 != c && in_range) { + putchar(last); + in_range = 0; + } + + if (!in_range) + putchar(c); + + last = c; + } + + if (in_range) + putchar(last); + + putchar(']'); + + p += 1 + *p; + } + break; + + case begline: + printf("/begline"); + break; + + case endline: + printf("/endline"); + break; + + case on_failure_jump: + extract_number_and_incr(&mcnt, &p); + printf("/on_failure_jump to %d", p + mcnt - start); + break; + + case on_failure_keep_string_jump: + extract_number_and_incr(&mcnt, &p); + printf("/on_failure_keep_string_jump to %d", p + mcnt - start); + break; + + case dummy_failure_jump: + extract_number_and_incr(&mcnt, &p); + printf("/dummy_failure_jump to %d", p + mcnt - start); + break; + + case push_dummy_failure: + printf("/push_dummy_failure"); + break; + + case maybe_pop_jump: + extract_number_and_incr(&mcnt, &p); + printf("/maybe_pop_jump to %d", p + mcnt - start); + break; + + case pop_failure_jump: + extract_number_and_incr(&mcnt, &p); + printf("/pop_failure_jump to %d", p + mcnt - start); + break; + + case jump_past_alt: + extract_number_and_incr(&mcnt, &p); + printf("/jump_past_alt to %d", p + mcnt - start); + break; + + case jump: + extract_number_and_incr(&mcnt, &p); + printf("/jump to %d", p + mcnt - start); + break; + + case succeed_n: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/succeed_n to %d, %d times", p1 - start, mcnt2); + break; + + case jump_n: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/jump_n to %d, %d times", p1 - start, mcnt2); + break; + + case set_number_at: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/set_number_at location %d to %d", p1 - start, mcnt2); + break; + + case wordbound: + printf("/wordbound"); + break; + + case notwordbound: + printf("/notwordbound"); + break; + + case wordbeg: + printf("/wordbeg"); + break; + + case wordend: + printf("/wordend"); + +# ifdef emacs + case before_dot: + printf("/before_dot"); + break; + + case at_dot: + printf("/at_dot"); + break; + + case after_dot: + printf("/after_dot"); + break; + + case syntaxspec: + printf("/syntaxspec"); + mcnt = *p++; + printf("/%d", mcnt); + break; + + case notsyntaxspec: + printf("/notsyntaxspec"); + mcnt = *p++; + printf("/%d", mcnt); + break; +# endif /* emacs */ + + case wordchar: + printf("/wordchar"); + break; + + case notwordchar: + printf("/notwordchar"); + break; + + case begbuf: + printf("/begbuf"); + break; + + case endbuf: + printf("/endbuf"); + break; + + default: + printf("?%d", *(p - 1)); + } + + putchar('\n'); + } + + printf("%d:\tend of pattern.\n", p - start); +} + + +void print_compiled_pattern(bufp) +struct re_pattern_buffer *bufp; +{ + unsigned char *buffer = bufp->buffer; + + print_partial_compiled_pattern(buffer, buffer + bufp->used); + printf("%ld bytes used/%ld bytes allocated.\n", + bufp->used, bufp->allocated); + + if (bufp->fastmap_accurate && bufp->fastmap) { + printf("fastmap: "); + print_fastmap(bufp->fastmap); + } + + printf("re_nsub: %d\t", bufp->re_nsub); + printf("regs_alloc: %d\t", bufp->regs_allocated); + printf("can_be_null: %d\t", bufp->can_be_null); + printf("newline_anchor: %d\n", bufp->newline_anchor); + printf("no_sub: %d\t", bufp->no_sub); + printf("not_bol: %d\t", bufp->not_bol); + printf("not_eol: %d\t", bufp->not_eol); + printf("syntax: %lx\n", bufp->syntax); + /* Perhaps we should print the translate table? */ +} + + +void print_double_string(where, string1, size1, string2, size2) +const char *where; +const char *string1; +const char *string2; +int size1; +int size2; +{ + int this_char; + + if (where == NULL) + printf("(null)"); + else { + if (FIRST_STRING_P(where)) { + for (this_char = where - string1; this_char < size1; + this_char++) + putchar(string1[this_char]); + + where = string2; + } + + for (this_char = where - string2; this_char < size2; this_char++) + putchar(string2[this_char]); + } +} + +void printchar(c) +int c; +{ + putc(c, stderr); +} + +#else /* not DEBUG */ + +# undef assert +# define assert(e) + +# define DEBUG_STATEMENT(e) +# define DEBUG_PRINT1(x) +# define DEBUG_PRINT2(x1, x2) +# define DEBUG_PRINT3(x1, x2, x3) +# define DEBUG_PRINT4(x1, x2, x3, x4) +# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) +# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) + +#endif /* not DEBUG */ + +/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ +/* This has no initializer because initialized variables in Emacs + become read-only after dumping. */ +reg_syntax_t re_syntax_options; + + +/* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. + + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + +reg_syntax_t re_set_syntax(syntax) +reg_syntax_t syntax; +{ + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; +#ifdef DEBUG + if (syntax & RE_DEBUG) + debug = 1; + else if (debug) /* was on but now is not */ + debug = 0; +#endif /* DEBUG */ + return ret; +} + +#ifdef _LIBC +weak_alias(__re_set_syntax, re_set_syntax) +#endif +/* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. + POSIX doesn't require that we do anything for REG_NOERROR, + but why not be nice? */ +static const char re_error_msgid[] = { +#define REG_NOERROR_IDX 0 + gettext_noop("Success") /* REG_NOERROR */ + "\0" +#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") + gettext_noop("No match") /* REG_NOMATCH */ + "\0" +#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") + gettext_noop("Invalid regular expression") /* REG_BADPAT */ + "\0" +#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") + gettext_noop("Invalid collation character") /* REG_ECOLLATE */ + "\0" +#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") + gettext_noop("Invalid character class name") /* REG_ECTYPE */ + "\0" +#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") + gettext_noop("Trailing backslash") /* REG_EESCAPE */ + "\0" +#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") + gettext_noop("Invalid back reference") /* REG_ESUBREG */ + "\0" +#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") + gettext_noop("Unmatched [ or [^") /* REG_EBRACK */ + "\0" +#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") + gettext_noop("Unmatched ( or \\(") /* REG_EPAREN */ + "\0" +#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") + gettext_noop("Unmatched \\{") /* REG_EBRACE */ + "\0" +#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") + gettext_noop("Invalid content of \\{\\}") /* REG_BADBR */ + "\0" +#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") + gettext_noop("Invalid range end") /* REG_ERANGE */ + "\0" +#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") + gettext_noop("Memory exhausted") /* REG_ESPACE */ + "\0" +#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") + gettext_noop("Invalid preceding regular expression") /* REG_BADRPT */ + "\0" +#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") + gettext_noop("Premature end of regular expression") /* REG_EEND */ + "\0" +#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") + gettext_noop("Regular expression too big") /* REG_ESIZE */ + "\0" +#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") + gettext_noop("Unmatched ) or \\)") /* REG_ERPAREN */ +}; + +static const size_t re_error_msgid_idx[] = { + REG_NOERROR_IDX, + REG_NOMATCH_IDX, + REG_BADPAT_IDX, + REG_ECOLLATE_IDX, + REG_ECTYPE_IDX, + REG_EESCAPE_IDX, + REG_ESUBREG_IDX, + REG_EBRACK_IDX, + REG_EPAREN_IDX, + REG_EBRACE_IDX, + REG_BADBR_IDX, + REG_ERANGE_IDX, + REG_ESPACE_IDX, + REG_BADRPT_IDX, + REG_EEND_IDX, + REG_ESIZE_IDX, + REG_ERPAREN_IDX +}; + +/* Avoiding alloca during matching, to placate r_alloc. */ + +/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the + searching and matching functions should not call alloca. On some + systems, alloca is implemented in terms of malloc, and if we're + using the relocating allocator routines, then malloc could cause a + relocation, which might (if the strings being searched are in the + ralloc heap) shift the data out from underneath the regexp + routines. + + Here's another reason to avoid allocation: Emacs + processes input from X in a signal handler; processing X input may + call malloc; if input arrives while a matching routine is calling + malloc, then we're scrod. But Emacs can't just block input while + calling matching routines; then we don't notice interrupts when + they come in. So, Emacs blocks input around all regexp calls + except the matching calls, which it leaves unprotected, in the + faith that they will not malloc. */ + +/* Normally, this is fine. */ +#define MATCH_MAY_ALLOCATE + +/* When using GNU C, we are not REALLY using the C alloca, no matter + what config.h may say. So don't take precautions for it. */ +#ifdef __GNUC__ +# undef C_ALLOCA +#endif + +/* The match routines may not allocate if (1) they would do it with malloc + and (2) it's not safe for them to use malloc. + Note that if REL_ALLOC is defined, matching would not use malloc for the + failure stack, but we would still use it for the register vectors; + so REL_ALLOC should not affect this. */ +#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs +# undef MATCH_MAY_ALLOCATE +#endif + + +/* Failure stack declarations and macros; both re_compile_fastmap and + re_match_2 use a failure stack. These have to be macros because of + REGEX_ALLOCATE_STACK. */ + + +/* Number of failure points for which to initially allocate space + when matching. If this number is exceeded, we allocate more + space, so it is not a hard limit. */ +#ifndef INIT_FAILURE_ALLOC +# define INIT_FAILURE_ALLOC 5 +#endif + +/* Roughly the maximum number of failure points on the stack. Would be + exactly that if always used MAX_FAILURE_ITEMS items each time we failed. + This is a variable only so users of regex can assign to it; we never + change it ourselves. */ + +#ifdef INT_IS_16BIT + +# if defined MATCH_MAY_ALLOCATE +/* 4400 was enough to cause a crash on Alpha OSF/1, + whose default stack limit is 2mb. */ +long int re_max_failures = 4000; +# else +long int re_max_failures = 2000; +# endif + +union fail_stack_elt { + unsigned char *pointer; + long int integer; +}; + +typedef union fail_stack_elt fail_stack_elt_t; + +typedef struct { + fail_stack_elt_t *stack; + unsigned long int size; + unsigned long int avail; /* Offset of next open position. */ +} fail_stack_type; + +#else /* not INT_IS_16BIT */ + +# if defined MATCH_MAY_ALLOCATE +/* 4400 was enough to cause a crash on Alpha OSF/1, + whose default stack limit is 2mb. */ +int re_max_failures = 20000; +# else +int re_max_failures = 2000; +# endif + +union fail_stack_elt { + unsigned char *pointer; + int integer; +}; + +typedef union fail_stack_elt fail_stack_elt_t; + +typedef struct { + fail_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} fail_stack_type; + +#endif /* INT_IS_16BIT */ + +#define FAIL_STACK_EMPTY() (fail_stack.avail == 0) +#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) +#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) + + +/* Define macros to initialize and free the failure stack. + Do `return -2' if the alloc fails. */ + +#ifdef MATCH_MAY_ALLOCATE +# define INIT_FAIL_STACK() \ + do { \ + fail_stack.stack = (fail_stack_elt_t *) \ + REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ + \ + if (fail_stack.stack == NULL) \ + return -2; \ + \ + fail_stack.size = INIT_FAILURE_ALLOC; \ + fail_stack.avail = 0; \ + } while (0) + +# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) +#else +# define INIT_FAIL_STACK() \ + do { \ + fail_stack.avail = 0; \ + } while (0) + +# define RESET_FAIL_STACK() +#endif + + +/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. + + Return 1 if succeeds, and 0 if either ran out of memory + allocating space for it or it was already too large. + + REGEX_REALLOCATE_STACK requires `destination' be declared. */ + +#define DOUBLE_FAIL_STACK(fail_stack) \ + ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \ + ? 0 \ + : ((fail_stack).stack = (fail_stack_elt_t *) \ + REGEX_REALLOCATE_STACK ((fail_stack).stack, \ + (fail_stack).size * sizeof (fail_stack_elt_t), \ + ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ + \ + (fail_stack).stack == NULL \ + ? 0 \ + : ((fail_stack).size <<= 1, \ + 1))) + + +/* Push pointer POINTER on FAIL_STACK. + Return 1 if was able to do so and 0 if ran out of memory allocating + space to do so. */ +#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ + ((FAIL_STACK_FULL () \ + && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ + ? 0 \ + : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ + 1)) + +/* Push a pointer value onto the failure stack. + Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. */ +#define PUSH_FAILURE_POINTER(item) \ + fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) + +/* This pushes an integer-valued item onto the failure stack. + Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. */ +#define PUSH_FAILURE_INT(item) \ + fail_stack.stack[fail_stack.avail++].integer = (item) + +/* Push a fail_stack_elt_t value onto the failure stack. + Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. */ +#define PUSH_FAILURE_ELT(item) \ + fail_stack.stack[fail_stack.avail++] = (item) + +/* These three POP... operations complement the three PUSH... operations. + All assume that `fail_stack' is nonempty. */ +#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer +#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer +#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] + +/* Used to omit pushing failure point id's when we're not debugging. */ +#ifdef DEBUG +# define DEBUG_PUSH PUSH_FAILURE_INT +# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () +#else +# define DEBUG_PUSH(item) +# define DEBUG_POP(item_addr) +#endif + + +/* Push the information about the state we will need + if we ever fail back to it. + + Requires variables fail_stack, regstart, regend, reg_info, and + num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination' + be declared. + + Does `return FAILURE_CODE' if runs out of memory. */ + +#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ + do { \ + char *destination; \ + /* Must be int, so when we don't save any registers, the arithmetic \ + of 0 + -1 isn't done as unsigned. */ \ + /* Can't be int, since there is not a shred of a guarantee that int \ + is wide enough to hold a value of something to which pointer can \ + be assigned */ \ + active_reg_t this_reg; \ + \ + DEBUG_STATEMENT (failure_id++); \ + DEBUG_STATEMENT (nfailure_points_pushed++); \ + DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ + DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ + DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ + \ + DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \ + DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ + \ + /* Ensure we have enough space allocated for what we will push. */ \ + while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ + { \ + if (!DOUBLE_FAIL_STACK (fail_stack)) \ + return failure_code; \ + \ + DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ + (fail_stack).size); \ + DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ + } \ + \ + /* Push the info, starting with the registers. */ \ + DEBUG_PRINT1 ("\n"); \ + \ + if (1) \ + for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ + this_reg++) \ + { \ + DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \ + DEBUG_STATEMENT (num_regs_pushed++); \ + \ + DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ + PUSH_FAILURE_POINTER (regstart[this_reg]); \ + \ + DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ + PUSH_FAILURE_POINTER (regend[this_reg]); \ + \ + DEBUG_PRINT2 (" info: %p\n ", \ + reg_info[this_reg].word.pointer); \ + DEBUG_PRINT2 (" match_null=%d", \ + REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ + DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ + DEBUG_PRINT2 (" matched_something=%d", \ + MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT2 (" ever_matched=%d", \ + EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT1 ("\n"); \ + PUSH_FAILURE_ELT (reg_info[this_reg].word); \ + } \ + \ + DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\ + PUSH_FAILURE_INT (lowest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\ + PUSH_FAILURE_INT (highest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ + PUSH_FAILURE_POINTER (pattern_place); \ + \ + DEBUG_PRINT2 (" Pushing string %p: `", string_place); \ + DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ + size2); \ + DEBUG_PRINT1 ("'\n"); \ + PUSH_FAILURE_POINTER (string_place); \ + \ + DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ + DEBUG_PUSH (failure_id); \ + } while (0) + +/* This is the number of items that are pushed and popped on the stack + for each register. */ +#define NUM_REG_ITEMS 3 + +/* Individual items aside from the registers. */ +#ifdef DEBUG +# define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ +#else +# define NUM_NONREG_ITEMS 4 +#endif + +/* We push at most this many items on the stack. */ +/* We used to use (num_regs - 1), which is the number of registers + this regexp will save; but that was changed to 5 + to avoid stack overflow for a regexp with lots of parens. */ +#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) + +/* We actually push this many items. */ +#define NUM_FAILURE_ITEMS \ + (((0 \ + ? 0 : highest_active_reg - lowest_active_reg + 1) \ + * NUM_REG_ITEMS) \ + + NUM_NONREG_ITEMS) + +/* How many items can still be added to the stack without overflowing it. */ +#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) + + +/* Pops what PUSH_FAIL_STACK pushes. + + We restore into the parameters, all of which should be lvalues: + STR -- the saved data position. + PAT -- the saved pattern position. + LOW_REG, HIGH_REG -- the highest and lowest active registers. + REGSTART, REGEND -- arrays of string positions. + REG_INFO -- array of information about each subexpression. + + Also assumes the variables `fail_stack' and (if debugging), `bufp', + `pend', `string1', `size1', `string2', and `size2'. */ + +#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ +{ \ + DEBUG_STATEMENT (unsigned failure_id;) \ + active_reg_t this_reg; \ + const unsigned char *string_temp; \ + \ + assert (!FAIL_STACK_EMPTY ()); \ + \ + /* Remove failure points and point to how many regs pushed. */ \ + DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ + DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ + DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ + \ + assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ + \ + DEBUG_POP (&failure_id); \ + DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ + \ + /* If the saved string location is NULL, it came from an \ + on_failure_keep_string_jump opcode, and we want to throw away the \ + saved NULL, thus retaining our current position in the string. */ \ + string_temp = POP_FAILURE_POINTER (); \ + if (string_temp != NULL) \ + str = (const char *) string_temp; \ + \ + DEBUG_PRINT2 (" Popping string %p: `", str); \ + DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ + DEBUG_PRINT1 ("'\n"); \ + \ + pat = (unsigned char *) POP_FAILURE_POINTER (); \ + DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ + \ + /* Restore register info. */ \ + high_reg = (active_reg_t) POP_FAILURE_INT (); \ + DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \ + \ + low_reg = (active_reg_t) POP_FAILURE_INT (); \ + DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \ + \ + if (1) \ + for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ + { \ + DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \ + \ + reg_info[this_reg].word = POP_FAILURE_ELT (); \ + DEBUG_PRINT2 (" info: %p\n", \ + reg_info[this_reg].word.pointer); \ + \ + regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ + DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ + \ + regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ + DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ + } \ + else \ + { \ + for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ + { \ + reg_info[this_reg].word.integer = 0; \ + regend[this_reg] = 0; \ + regstart[this_reg] = 0; \ + } \ + highest_active_reg = high_reg; \ + } \ + \ + set_regs_matched_done = 0; \ + DEBUG_STATEMENT (nfailure_points_popped++); \ +} /* POP_FAILURE_POINT */ + + + +/* Structure for per-register (a.k.a. per-group) information. + Other register information, such as the + starting and ending positions (which are addresses), and the list of + inner groups (which is a bits list) are maintained in separate + variables. + + We are making a (strictly speaking) nonportable assumption here: that + the compiler will pack our bit fields into something that fits into + the type of `word', i.e., is something that fits into one item on the + failure stack. */ + + +/* Declarations and macros for re_match_2. */ + +typedef union { + fail_stack_elt_t word; + struct { + /* This field is one if this group can match the empty string, + zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ +#define MATCH_NULL_UNSET_VALUE 3 + unsigned match_null_string_p:2; + unsigned is_active:1; + unsigned matched_something:1; + unsigned ever_matched_something:1; + } bits; +} register_info_type; + +#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) +#define IS_ACTIVE(R) ((R).bits.is_active) +#define MATCHED_SOMETHING(R) ((R).bits.matched_something) +#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) + + +/* Call this when have matched a real character; it sets `matched' flags + for the subexpressions which we are currently inside. Also records + that those subexprs have matched. */ +#define SET_REGS_MATCHED() \ + do \ + { \ + if (!set_regs_matched_done) \ + { \ + active_reg_t r; \ + set_regs_matched_done = 1; \ + for (r = lowest_active_reg; r <= highest_active_reg; r++) \ + { \ + MATCHED_SOMETHING (reg_info[r]) \ + = EVER_MATCHED_SOMETHING (reg_info[r]) \ + = 1; \ + } \ + } \ + } \ + while (0) + +/* Registers are set to a sentinel when they haven't yet matched. */ +static char reg_unset_dummy; + +#define REG_UNSET_VALUE (®_unset_dummy) +#define REG_UNSET(e) ((e) == REG_UNSET_VALUE) + +/* Subroutine declarations and macros for regex_compile. */ + +static reg_errcode_t regex_compile +_RE_ARGS( + (const char *pattern, size_t size, reg_syntax_t syntax, + struct re_pattern_buffer * bufp)); +static void store_op1 + +_RE_ARGS((re_opcode_t op, unsigned char *loc, int arg)); +static void store_op2 +_RE_ARGS((re_opcode_t op, unsigned char *loc, int arg1, int arg2)); +static void insert_op1 +_RE_ARGS( + + (re_opcode_t op, unsigned char *loc, int arg, + unsigned char *end)); +static void insert_op2 +_RE_ARGS( + (re_opcode_t op, unsigned char *loc, int arg1, int arg2, + + unsigned char *end)); +static boolean at_begline_loc_p +_RE_ARGS((const char *pattern, const char *p, reg_syntax_t syntax)); +static boolean at_endline_loc_p +_RE_ARGS((const char *p, const char *pend, reg_syntax_t syntax)); +static reg_errcode_t compile_range +_RE_ARGS( + (const char **p_ptr, const char *pend, char *translate, + reg_syntax_t syntax, unsigned char *b)); + +/* Fetch the next character in the uncompiled pattern---translating it + if necessary. Also cast from a signed character in the constant + string passed to us by the user to an unsigned char that we can use + as an array index (in, e.g., `translate'). */ +#ifndef PATFETCH +# define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + if (translate) c = (unsigned char) translate[c]; \ + } while (0) +#endif + +/* Fetch the next character in the uncompiled pattern, with no + translation. */ +#define PATFETCH_RAW(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + } while (0) + +/* Go backwards one character in the pattern. */ +#define PATUNFETCH p-- + + +/* If `translate' is non-null, return translate[D], else just D. We + cast the subscript to translate because some data is declared as + `char *', to avoid warnings when a string constant is passed. But + when we use a character as a subscript we must make it unsigned. */ +#ifndef TRANSLATE +# define TRANSLATE(d) \ + (translate ? (char) translate[(unsigned char) (d)] : (d)) +#endif + + +/* Macros for outputting the compiled pattern into `buffer'. */ + +/* If the buffer isn't allocated when it comes in, use this. */ +#define INIT_BUF_SIZE 32 + +/* Make sure we have at least N more bytes of space in buffer. */ +#define GET_BUFFER_SPACE(n) \ + while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ + EXTEND_BUFFER () + +/* Make sure we have one more byte of buffer space and then add C to it. */ +#define BUF_PUSH(c) \ + do { \ + GET_BUFFER_SPACE (1); \ + *b++ = (unsigned char) (c); \ + } while (0) + + +/* Ensure we have two more bytes of buffer space and then append C1 and C2. */ +#define BUF_PUSH_2(c1, c2) \ + do { \ + GET_BUFFER_SPACE (2); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + } while (0) + + +/* As with BUF_PUSH_2, except for three bytes. */ +#define BUF_PUSH_3(c1, c2, c3) \ + do { \ + GET_BUFFER_SPACE (3); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + *b++ = (unsigned char) (c3); \ + } while (0) + + +/* Store a jump with opcode OP at LOC to location TO. We store a + relative address offset by the three bytes the jump itself occupies. */ +#define STORE_JUMP(op, loc, to) \ + store_op1 (op, loc, (int) ((to) - (loc) - 3)) + +/* Likewise, for a two-argument jump. */ +#define STORE_JUMP2(op, loc, to, arg) \ + store_op2 (op, loc, (int) ((to) - (loc) - 3), arg) + +/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP(op, loc, to) \ + insert_op1 (op, loc, (int) ((to) - (loc) - 3), b) + +/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP2(op, loc, to, arg) \ + insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b) + + +/* This is not an arbitrary limit: the arguments which represent offsets + into the pattern are two bytes long. So if 2^16 bytes turns out to + be too small, many things would have to change. */ +/* Any other compiler which, like MSC, has allocation limit below 2^16 + bytes will have to use approach similar to what was done below for + MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up + reallocating to 0 bytes. Such thing is not going to work too well. + You have been warned!! */ +#if defined _MSC_VER && !defined WIN32 +/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. + The REALLOC define eliminates a flurry of conversion warnings, + but is not required. */ +# define MAX_BUF_SIZE 65500L +# define REALLOC(p,s) realloc ((p), (size_t) (s)) +#else +# define MAX_BUF_SIZE (1L << 16) +# define REALLOC(p,s) realloc ((p), (s)) +#endif + +/* Extend the buffer by twice its current size via realloc and + reset the pointers that pointed into the old block to point to the + correct places in the new one. If extending the buffer results in it + being larger than MAX_BUF_SIZE, then flag memory exhausted. */ +#define EXTEND_BUFFER() \ + do { \ + unsigned char *old_buffer = bufp->buffer; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\ + if (bufp->buffer == NULL) \ + return REG_ESPACE; \ + /* If the buffer moved, move all the pointers into it. */ \ + if (old_buffer != bufp->buffer) \ + { \ + b = (b - old_buffer) + bufp->buffer; \ + begalt = (begalt - old_buffer) + bufp->buffer; \ + if (fixup_alt_jump) \ + fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ + if (laststart) \ + laststart = (laststart - old_buffer) + bufp->buffer; \ + if (pending_exact) \ + pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ + } \ + } while (0) + + +/* Since we have one byte reserved for the register number argument to + {start,stop}_memory, the maximum number of groups we can report + things about is what fits in that byte. */ +#define MAX_REGNUM 255 + +/* But patterns can have more than `MAX_REGNUM' registers. We just + ignore the excess. */ +typedef unsigned regnum_t; + + +/* Macros for the compile stack. */ + +/* Since offsets can go either forwards or backwards, this type needs to + be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ +/* int may be not enough when sizeof(int) == 2. */ +typedef long pattern_offset_t; + +typedef struct { + pattern_offset_t begalt_offset; + pattern_offset_t fixup_alt_jump; + pattern_offset_t inner_group_offset; + pattern_offset_t laststart_offset; + regnum_t regnum; +} compile_stack_elt_t; + + +typedef struct { + compile_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} compile_stack_type; + + +#define INIT_COMPILE_STACK_SIZE 32 + +#define COMPILE_STACK_EMPTY (compile_stack.avail == 0) +#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) + +/* The next available element. */ +#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) + + +/* Set the bit for character C in a list. */ +#define SET_LIST_BIT(c) \ + (b[((unsigned char) (c)) / BYTEWIDTH] \ + |= 1 << (((unsigned char) c) % BYTEWIDTH)) + + +/* Get the next unsigned number in the uncompiled pattern. */ +#define GET_UNSIGNED_NUMBER(num) \ + { if (p != pend) \ + { \ + PATFETCH (c); \ + while ('0' <= c && c <= '9') \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + if (p == pend) \ + break; \ + PATFETCH (c); \ + } \ + } \ + } + +#if defined _LIBC || WIDE_CHAR_SUPPORT +/* The GNU C library provides support for user-defined character classes + and the functions from ISO C amendement 1. */ +# ifdef CHARCLASS_NAME_MAX +# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX +# else +/* This shouldn't happen but some implementation might still have this + problem. Use a reasonable default value. */ +# define CHAR_CLASS_MAX_LENGTH 256 +# endif + +# ifdef _LIBC +# define IS_CHAR_CLASS(string) __wctype (string) +# else +# define IS_CHAR_CLASS(string) wctype (string) +# endif +#else +# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ + +# define IS_CHAR_CLASS(string) \ + (STREQ (string, "alpha") || STREQ (string, "upper") \ + || STREQ (string, "lower") || STREQ (string, "digit") \ + || STREQ (string, "alnum") || STREQ (string, "xdigit") \ + || STREQ (string, "space") || STREQ (string, "print") \ + || STREQ (string, "punct") || STREQ (string, "graph") \ + || STREQ (string, "cntrl") || STREQ (string, "blank")) +#endif + +#ifndef MATCH_MAY_ALLOCATE + +/* If we cannot allocate large objects within re_match_2_internal, + we make the fail stack and register vectors global. + The fail stack, we grow to the maximum size when a regexp + is compiled. + The register vectors, we adjust in size each time we + compile a regexp, according to the number of registers it needs. */ + +static fail_stack_type fail_stack; + +/* Size with which the following vectors are currently allocated. + That is so we can make them bigger as needed, + but never make them smaller. */ +static int regs_allocated_size; + +static const char **regstart, **regend; +static const char **old_regstart, **old_regend; +static const char **best_regstart, **best_regend; +static register_info_type *reg_info; +static const char **reg_dummy; +static register_info_type *reg_info_dummy; + +/* Make the register vectors big enough for NUM_REGS registers, + but don't make them smaller. */ + +static regex_grow_registers(num_regs) +int num_regs; +{ + if (num_regs > regs_allocated_size) { + RETALLOC_IF(regstart, num_regs, const char *); + RETALLOC_IF(regend, num_regs, const char *); + RETALLOC_IF(old_regstart, num_regs, const char *); + RETALLOC_IF(old_regend, num_regs, const char *); + RETALLOC_IF(best_regstart, num_regs, const char *); + RETALLOC_IF(best_regend, num_regs, const char *); + + RETALLOC_IF(reg_info, num_regs, register_info_type); + RETALLOC_IF(reg_dummy, num_regs, const char *); + + RETALLOC_IF(reg_info_dummy, num_regs, register_info_type); + + regs_allocated_size = num_regs; + } +} + +#endif /* not MATCH_MAY_ALLOCATE */ + +static boolean group_in_compile_stack _RE_ARGS((compile_stack_type + compile_stack, + + regnum_t regnum)); + +/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. + Returns one of error codes defined in `regex.h', or zero for success. + + Assumes the `allocated' (and perhaps `buffer') and `translate' + fields are set in BUFP on entry. + + If it succeeds, results are put in BUFP (if it returns an error, the + contents of BUFP are undefined): + `buffer' is the compiled pattern; + `syntax' is set to SYNTAX; + `used' is set to the length of the compiled pattern; + `fastmap_accurate' is zero; + `re_nsub' is the number of subexpressions in PATTERN; + `not_bol' and `not_eol' are zero; + + The `fastmap' and `newline_anchor' fields are neither + examined nor set. */ + +/* Return, freeing storage we allocated. */ +#define FREE_STACK_RETURN(value) \ + return (free (compile_stack.stack), value) + +static reg_errcode_t regex_compile(pattern, size, syntax, bufp) +const char *pattern; +size_t size; +reg_syntax_t syntax; +struct re_pattern_buffer *bufp; +{ + /* We fetch characters from PATTERN here. Even though PATTERN is + `char *' (i.e., signed), we declare these variables as unsigned, so + they can be reliably used as array indices. */ + register unsigned char c, c1; + + /* A random temporary spot in PATTERN. */ + const char *p1; + + /* Points to the end of the buffer, where we should append. */ + register unsigned char *b; + + /* Keeps track of unclosed groups. */ + compile_stack_type compile_stack; + + /* Points to the current (ending) position in the pattern. */ + const char *p = pattern; + const char *pend = pattern + size; + + /* How to translate the characters in the pattern. */ + RE_TRANSLATE_TYPE translate = bufp->translate; + + /* Address of the count-byte of the most recently inserted `exactn' + command. This makes it possible to tell if a new exact-match + character can be added to that command or if the character requires + a new `exactn' command. */ + unsigned char *pending_exact = 0; + + /* Address of start of the most recently finished expression. + This tells, e.g., postfix * where to find the start of its + operand. Reset at the beginning of groups and alternatives. */ + unsigned char *laststart = 0; + + /* Address of beginning of regexp, or inside of last group. */ + unsigned char *begalt; + + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. */ + const char *beg_interval; + + /* Address of the place where a forward jump should go to the end of + the containing expression. Each alternative of an `or' -- except the + last -- ends with a forward jump of this sort. */ + unsigned char *fixup_alt_jump = 0; + + /* Counts open-groups as they are encountered. Remembered for the + matching close-group on the compile stack, so the same register + number is put in the stop_memory as the start_memory. */ + regnum_t regnum = 0; + +#ifdef DEBUG + DEBUG_PRINT1("\nCompiling pattern: "); + if (debug) { + unsigned debug_count; + + for (debug_count = 0; debug_count < size; debug_count++) + putchar(pattern[debug_count]); + putchar('\n'); + } +#endif /* DEBUG */ + + /* Initialize the compile stack. */ + compile_stack.stack = + TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size = INIT_COMPILE_STACK_SIZE; + compile_stack.avail = 0; + + /* Initialize the pattern buffer. */ + bufp->syntax = syntax; + bufp->fastmap_accurate = 0; + bufp->not_bol = bufp->not_eol = 0; + + /* Set `used' to zero, so that if we return an error, the pattern + printer (for debugging) will think there's no pattern. We reset it + at the end. */ + bufp->used = 0; + + /* Always count groups, whether or not bufp->no_sub is set. */ + bufp->re_nsub = 0; + +#if !defined emacs && !defined SYNTAX_TABLE + /* Initialize the syntax table. */ + init_syntax_once(); +#endif + + if (bufp->allocated == 0) { + if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. */ + RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char); + } else { /* Caller did not allocate a buffer. Do it for them. */ + bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char); + } + if (!bufp->buffer) + FREE_STACK_RETURN(REG_ESPACE); + + bufp->allocated = INIT_BUF_SIZE; + } + + begalt = b = bufp->buffer; + + /* Loop through the uncompiled pattern until we're at the end. */ + while (p != pend) { + PATFETCH(c); + + switch (c) { + case '^': + { + if ( /* If at start of pattern, it's an operator. */ + p == pattern + 1 + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's come before. */ + || at_begline_loc_p(pattern, p, syntax)) + BUF_PUSH(begline); + else + goto normal_char; + } + break; + + + case '$': + { + if ( /* If at end of pattern, it's an operator. */ + p == pend + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's next. */ + || at_endline_loc_p(p, pend, syntax)) + BUF_PUSH(endl