diff options
author | Eric Andersen <andersen@codepoet.org> | 2000-10-20 03:48:11 +0000 |
---|---|---|
committer | Eric Andersen <andersen@codepoet.org> | 2000-10-20 03:48:11 +0000 |
commit | 82d766043c6a8dcf6283788419f110dd7ab52f80 (patch) | |
tree | 09505131008d1b4d2178065878c3e8e0d54c26a2 /libc/misc/regex | |
parent | 5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff) |
A smaller, kinder, gentler regexp implementation.
Diffstat (limited to 'libc/misc/regex')
-rw-r--r-- | libc/misc/regex/Makefile | 2 | ||||
-rw-r--r-- | libc/misc/regex/regex.c | 5725 | ||||
-rw-r--r-- | libc/misc/regex/rx.c | 7273 |
3 files changed, 5726 insertions, 7274 deletions
diff --git a/libc/misc/regex/Makefile b/libc/misc/regex/Makefile index c4c13f6cf..38b7e98bf 100644 --- a/libc/misc/regex/Makefile +++ b/libc/misc/regex/Makefile @@ -24,7 +24,7 @@ TOPDIR=../../ include $(TOPDIR)Rules.mak LIBC=$(TOPDIR)libc.a -CSRC=rx.c +CSRC=regex.c COBJS=$(patsubst %.c,%.o, $(CSRC)) OBJS=$(COBJS) diff --git a/libc/misc/regex/regex.c b/libc/misc/regex/regex.c new file mode 100644 index 000000000..64e754ee0 --- /dev/null +++ b/libc/misc/regex/regex.c @@ -0,0 +1,5725 @@ +/* Extended regular expression matching and search library, + version 0.12. + (Implements POSIX draft P1003.2/D11.2, except for some of the + internationalization features.) + Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* AIX requires this to be the first thing in the file. */ +#if defined _AIX && !defined REGEX_MALLOC +#pragma alloca +#endif + +#undef _GNU_SOURCE +#define _GNU_SOURCE +#define STDC_HEADERS + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#ifndef PARAMS +# if defined __GNUC__ || (defined __STDC__ && __STDC__) +# define PARAMS(args) args +# else +# define PARAMS(args) () +# endif /* GCC. */ +#endif /* Not PARAMS. */ + +#if defined STDC_HEADERS && !defined emacs +# include <stddef.h> +#else +/* We need this for `regex.h', and perhaps for the Emacs include files. */ +# include <sys/types.h> +#endif + +#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) + +/* For platform which support the ISO C amendement 1 functionality we + support user defined character classes. */ +#if defined _LIBC || WIDE_CHAR_SUPPORT +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ +# include <wchar.h> +# include <wctype.h> +#endif + +#ifdef _LIBC +/* We have to keep the namespace clean. */ +# define regfree(preg) __regfree (preg) +# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) +# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) +# define regerror(errcode, preg, errbuf, errbuf_size) \ + __regerror(errcode, preg, errbuf, errbuf_size) +# define re_set_registers(bu, re, nu, st, en) \ + __re_set_registers (bu, re, nu, st, en) +# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ + __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) +# define re_match(bufp, string, size, pos, regs) \ + __re_match (bufp, string, size, pos, regs) +# define re_search(bufp, string, size, startpos, range, regs) \ + __re_search (bufp, string, size, startpos, range, regs) +# define re_compile_pattern(pattern, length, bufp) \ + __re_compile_pattern (pattern, length, bufp) +# define re_set_syntax(syntax) __re_set_syntax (syntax) +# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ + __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) +# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) + +#define btowc __btowc +#endif + +/* This is for other GNU distributions with internationalized messages. */ +#if HAVE_LIBINTL_H || defined _LIBC +# include <libintl.h> +#else +# define gettext(msgid) (msgid) +#endif + +#ifndef gettext_noop +/* This define is so xgettext can find the internationalizable + strings. */ +# define gettext_noop(String) String +#endif + +/* The `emacs' switch turns on certain matching commands + that make sense only in Emacs. */ +#ifdef emacs + +# include "lisp.h" +# include "buffer.h" +# include "syntax.h" + +#else /* not emacs */ + +/* If we are not linking with Emacs proper, + we can't use the relocating allocator + even if config.h says that we can. */ +# undef REL_ALLOC + +# if defined STDC_HEADERS || defined _LIBC +# include <stdlib.h> +# else +char *malloc(); +char *realloc(); +# endif + +/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. + If nothing else has been done, use the method below. */ +# ifdef INHIBIT_STRING_HEADER +# if !(defined HAVE_BZERO && defined HAVE_BCOPY) +# if !defined bzero && !defined bcopy +# undef INHIBIT_STRING_HEADER +# endif +# endif +# endif + +/* This is the normal way of making sure we have a bcopy and a bzero. + This is used in most programs--a few other programs avoid this + by defining INHIBIT_STRING_HEADER. */ +# ifndef INHIBIT_STRING_HEADER +# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC +# include <string.h> +# ifndef bzero +# ifndef _LIBC +# define bzero(s, n) (memset (s, '\0', n), (s)) +# else +# define bzero(s, n) __bzero (s, n) +# endif +# endif +# else +# include <strings.h> +# ifndef memcmp +# define memcmp(s1, s2, n) bcmp (s1, s2, n) +# endif +# ifndef memcpy +# define memcpy(d, s, n) (bcopy (s, d, n), (d)) +# endif +# endif +# endif + +/* Define the syntax stuff for \<, \>, etc. */ + +/* This must be nonzero for the wordchar and notwordchar pattern + commands in re_match_2. */ +# ifndef Sword +# define Sword 1 +# endif + +# ifdef SWITCH_ENUM_BUG +# define SWITCH_ENUM_CAST(x) ((int)(x)) +# else +# define SWITCH_ENUM_CAST(x) (x) +# endif + +#endif /* not emacs */ + +/* Get the interface, including the syntax bits. */ +#include <regex.h> + +/* isalpha etc. are used for the character classes. */ +#include <ctype.h> + +/* Jim Meyering writes: + + "... Some ctype macros are valid only for character codes that + isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when + using /bin/cc or gcc but without giving an ansi option). So, all + ctype uses should be through macros like ISPRINT... If + STDC_HEADERS is defined, then autoconf has verified that the ctype + macros don't need to be guarded with references to isascii. ... + Defining isascii to 1 should let any compiler worth its salt + eliminate the && through constant folding." + Solaris defines some of these symbols so we must undefine them first. */ + +#undef ISASCII +#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) +# define ISASCII(c) 1 +#else +# define ISASCII(c) isascii(c) +#endif + +#ifdef isblank +# define ISBLANK(c) (ISASCII (c) && isblank (c)) +#else +# define ISBLANK(c) ((c) == ' ' || (c) == '\t') +#endif +#ifdef isgraph +# define ISGRAPH(c) (ISASCII (c) && isgraph (c)) +#else +# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) +#endif + +#undef ISPRINT +#define ISPRINT(c) (ISASCII (c) && isprint (c)) +#define ISDIGIT(c) (ISASCII (c) && isdigit (c)) +#define ISALNUM(c) (ISASCII (c) && isalnum (c)) +#define ISALPHA(c) (ISASCII (c) && isalpha (c)) +#define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) +#define ISLOWER(c) (ISASCII (c) && islower (c)) +#define ISPUNCT(c) (ISASCII (c) && ispunct (c)) +#define ISSPACE(c) (ISASCII (c) && isspace (c)) +#define ISUPPER(c) (ISASCII (c) && isupper (c)) +#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) + +#ifdef _tolower +# define TOLOWER(c) _tolower(c) +#else +# define TOLOWER(c) tolower(c) +#endif + +#ifndef NULL +# define NULL (void *)0 +#endif + +/* We remove any previous definition of `SIGN_EXTEND_CHAR', + since ours (we hope) works properly with all combinations of + machines, compilers, `char' and `unsigned char' argument types. + (Per Bothner suggested the basic approach.) */ +#undef SIGN_EXTEND_CHAR +#if __STDC__ +# define SIGN_EXTEND_CHAR(c) ((signed char) (c)) +#else /* not __STDC__ */ +/* As in Harbison and Steele. */ +# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) +#endif + +#ifndef emacs +/* How many characters in the character set. */ +# define CHAR_SET_SIZE 256 + +# ifdef SYNTAX_TABLE + +extern char *re_syntax_table; + +# else /* not SYNTAX_TABLE */ + +static char re_syntax_table[CHAR_SET_SIZE]; + +static void init_syntax_once() +{ + register int c; + static int done = 0; + + if (done) + return; + bzero(re_syntax_table, sizeof re_syntax_table); + + for (c = 0; c < CHAR_SET_SIZE; ++c) + if (ISALNUM(c)) + re_syntax_table[c] = Sword; + + re_syntax_table['_'] = Sword; + + done = 1; +} + +# endif /* not SYNTAX_TABLE */ + +# define SYNTAX(c) re_syntax_table[((c) & 0xFF)] + +#endif /* emacs */ + +/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we + use `alloca' instead of `malloc'. This is because using malloc in + re_search* or re_match* could cause memory leaks when C-g is used in + Emacs; also, malloc is slower and causes storage fragmentation. On + the other hand, malloc is more portable, and easier to debug. + + Because we sometimes use alloca, some routines have to be macros, + not functions -- `alloca'-allocated space disappears at the end of the + function it is called in. */ + +#ifdef REGEX_MALLOC + +# define REGEX_ALLOCATE malloc +# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) +# define REGEX_FREE free + +#else /* not REGEX_MALLOC */ + +/* Emacs already defines alloca, sometimes. */ +# ifndef alloca + +/* Make alloca work the best possible way. */ +# ifdef __GNUC__ +# define alloca __builtin_alloca +# else /* not __GNUC__ */ +# if HAVE_ALLOCA_H +# include <alloca.h> +# endif /* HAVE_ALLOCA_H */ +# endif /* not __GNUC__ */ + +# endif /* not alloca */ + +# define REGEX_ALLOCATE alloca + +/* Assumes a `char *destination' variable. */ +# define REGEX_REALLOCATE(source, osize, nsize) \ + (destination = (char *) alloca (nsize), \ + memcpy (destination, source, osize)) + +/* No need to do anything to free, after alloca. */ +# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ + +#endif /* not REGEX_MALLOC */ + +/* Define how to allocate the failure stack. */ + +#if defined REL_ALLOC && defined REGEX_MALLOC + +# define REGEX_ALLOCATE_STACK(size) \ + r_alloc (&failure_stack_ptr, (size)) +# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ + r_re_alloc (&failure_stack_ptr, (nsize)) +# define REGEX_FREE_STACK(ptr) \ + r_alloc_free (&failure_stack_ptr) + +#else /* not using relocating allocator */ + +# ifdef REGEX_MALLOC + +# define REGEX_ALLOCATE_STACK malloc +# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) +# define REGEX_FREE_STACK free + +# else /* not REGEX_MALLOC */ + +# define REGEX_ALLOCATE_STACK alloca + +# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ + REGEX_REALLOCATE (source, osize, nsize) +/* No need to explicitly free anything. */ +# define REGEX_FREE_STACK(arg) + +# endif /* not REGEX_MALLOC */ +#endif /* not using relocating allocator */ + + +/* True if `size1' is non-NULL and PTR is pointing anywhere inside + `string1' or just past its end. This works if PTR is NULL, which is + a good thing. */ +#define FIRST_STRING_P(ptr) \ + (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) + +/* (Re)Allocate N items of type T using malloc, or fail. */ +#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) +#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) +#define RETALLOC_IF(addr, n, t) \ + if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) +#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) + +#define BYTEWIDTH 8 /* In bits. */ + +#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) + +#undef MAX +#undef MIN +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +typedef char boolean; + +#define false 0 +#define true 1 + +static int re_match_2_internal PARAMS((struct re_pattern_buffer * bufp, + const char *string1, int size1, + const char *string2, int size2, + int pos, + struct re_registers * regs, + + int stop)); + +/* These are the command codes that appear in compiled regular + expressions. Some opcodes are followed by argument bytes. A + command code can specify any interpretation whatsoever for its + arguments. Zero bytes may appear in the compiled regular expression. */ + +typedef enum { + no_op = 0, + + /* Succeed right away--no more backtracking. */ + succeed, + + /* Followed by one byte giving n, then by n literal bytes. */ + exactn, + + /* Matches any (more or less) character. */ + anychar, + + /* Matches any one char belonging to specified set. First + following byte is number of bitmap bytes. Then come bytes + for a bitmap saying which chars are in. Bits in each byte + are ordered low-bit-first. A character is in the set if its + bit is 1. A character too large to have a bit in the map is + automatically not in the set. */ + charset, + + /* Same parameters as charset, but match any character that is + not one of those specified. */ + charset_not, + + /* Start remembering the text that is matched, for storing in a + register. Followed by one byte with the register number, in + the range 0 to one less than the pattern buffer's re_nsub + field. Then followed by one byte with the number of groups + inner to this one. (This last has to be part of the + start_memory only because we need it in the on_failure_jump + of re_match_2.) */ + start_memory, + + /* Stop remembering the text that is matched and store it in a + memory register. Followed by one byte with the register + number, in the range 0 to one less than `re_nsub' in the + pattern buffer, and one byte with the number of inner groups, + just like `start_memory'. (We need the number of inner + groups here because we don't have any easy way of finding the + corresponding start_memory when we're at a stop_memory.) */ + stop_memory, + + /* Match a duplicate of something remembered. Followed by one + byte containing the register number. */ + duplicate, + + /* Fail unless at beginning of line. */ + begline, + + /* Fail unless at end of line. */ + endline, + + /* Succeeds if at beginning of buffer (if emacs) or at beginning + of string to be matched (if not). */ + begbuf, + + /* Analogously, for end of buffer/string. */ + endbuf, + + /* Followed by two byte relative address to which to jump. */ + jump, + + /* Same as jump, but marks the end of an alternative. */ + jump_past_alt, + + /* Followed by two-byte relative address of place to resume at + in case of failure. */ + on_failure_jump, + + /* Like on_failure_jump, but pushes a placeholder instead of the + current string position when executed. */ + on_failure_keep_string_jump, + + /* Throw away latest failure point and then jump to following + two-byte relative address. */ + pop_failure_jump, + + /* Change to pop_failure_jump if know won't have to backtrack to + match; otherwise change to jump. This is used to jump + back to the beginning of a repeat. If what follows this jump + clearly won't match what the repeat does, such that we can be + sure that there is no use backtracking out of repetitions + already matched, then we change it to a pop_failure_jump. + Followed by two-byte address. */ + maybe_pop_jump, + + /* Jump to following two-byte address, and push a dummy failure + point. This failure point will be thrown away if an attempt + is made to use it for a failure. A `+' construct makes this + before the first repeat. Also used as an intermediary kind + of jump when compiling an alternative. */ + dummy_failure_jump, + + /* Push a dummy failure point and continue. Used at the end of + alternatives. */ + push_dummy_failure, + + /* Followed by two-byte relative address and two-byte number n. + After matching N times, jump to the address upon failure. */ + succeed_n, + + /* Followed by two-byte relative address, and two-byte number n. + Jump to the address N times, then fail. */ + jump_n, + + /* Set the following two-byte relative address to the + subsequent two-byte number. The address *includes* the two + bytes of number. */ + set_number_at, + + wordchar, /* Matches any word-constituent character. */ + notwordchar, /* Matches any char that is not a word-constituent. */ + + wordbeg, /* Succeeds if at word beginning. */ + wordend, /* Succeeds if at word end. */ + + wordbound, /* Succeeds if at a word boundary. */ + notwordbound /* Succeeds if not at a word boundary. */ +#ifdef emacs + , before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ + + /* Matches any character whose syntax is specified. Followed by + a byte which contains a syntax code, e.g., Sword. */ + syntaxspec, + + /* Matches any character whose syntax is not that specified. */ + notsyntaxspec +#endif /* emacs */ +} re_opcode_t; + +/* Common operations on the compiled pattern. */ + +/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ + +#define STORE_NUMBER(destination, number) \ + do { \ + (destination)[0] = (number) & 0377; \ + (destination)[1] = (number) >> 8; \ + } while (0) + +/* Same as STORE_NUMBER, except increment DESTINATION to + the byte after where the number is stored. Therefore, DESTINATION + must be an lvalue. */ + +#define STORE_NUMBER_AND_INCR(destination, number) \ + do { \ + STORE_NUMBER (destination, number); \ + (destination) += 2; \ + } while (0) + +/* Put into DESTINATION a number stored in two contiguous bytes starting + at SOURCE. */ + +#define EXTRACT_NUMBER(destination, source) \ + do { \ + (destination) = *(source) & 0377; \ + (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ + } while (0) + +#ifdef DEBUG +static void extract_number _RE_ARGS((int *dest, unsigned char *source)); +static void extract_number(dest, source) +int *dest; +unsigned char *source; +{ + int temp = SIGN_EXTEND_CHAR(*(source + 1)); + + *dest = *source & 0377; + *dest += temp << 8; +} + +# ifndef EXTRACT_MACROS /* To debug the macros. */ +# undef EXTRACT_NUMBER +# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) +# endif /* not EXTRACT_MACROS */ + +#endif /* DEBUG */ + +/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. + SOURCE must be an lvalue. */ + +#define EXTRACT_NUMBER_AND_INCR(destination, source) \ + do { \ + EXTRACT_NUMBER (destination, source); \ + (source) += 2; \ + } while (0) + +#ifdef DEBUG +static void extract_number_and_incr _RE_ARGS((int *destination, + unsigned char **source)); +static void extract_number_and_incr(destination, source) +int *destination; +unsigned char **source; +{ + extract_number(destination, *source); + *source += 2; +} + +# ifndef EXTRACT_MACROS +# undef EXTRACT_NUMBER_AND_INCR +# define EXTRACT_NUMBER_AND_INCR(dest, src) \ + extract_number_and_incr (&dest, &src) +# endif /* not EXTRACT_MACROS */ + +#endif /* DEBUG */ + +/* If DEBUG is defined, Regex prints many voluminous messages about what + it is doing (if the variable `debug' is nonzero). If linked with the + main program in `iregex.c', you can enter patterns and strings + interactively. And if linked with the main program in `main.c' and + the other test files, you can run the already-written tests. */ + +#ifdef DEBUG + +/* We use standard I/O for debugging. */ +# include <stdio.h> + +/* It is useful to test things that ``must'' be true when debugging. */ +# include <assert.h> + +static int debug; + +# define DEBUG_STATEMENT(e) e +# define DEBUG_PRINT1(x) if (debug) printf (x) +# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) +# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) +# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) +# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (debug) print_partial_compiled_pattern (s, e) +# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (debug) print_double_string (w, s1, sz1, s2, sz2) + + +/* Print the fastmap in human-readable form. */ + +void print_fastmap(fastmap) +char *fastmap; +{ + unsigned was_a_range = 0; + unsigned i = 0; + + while (i < (1 << BYTEWIDTH)) { + if (fastmap[i++]) { + was_a_range = 0; + putchar(i - 1); + while (i < (1 << BYTEWIDTH) && fastmap[i]) { + was_a_range = 1; + i++; + } + if (was_a_range) { + printf("-"); + putchar(i - 1); + } + } + } + putchar('\n'); +} + + +/* Print a compiled pattern string in human-readable form, starting at + the START pointer into it and ending just before the pointer END. */ + +void print_partial_compiled_pattern(start, end) +unsigned char *start; +unsigned char *end; +{ + int mcnt, mcnt2; + unsigned char *p1; + unsigned char *p = start; + unsigned char *pend = end; + + if (start == NULL) { + printf("(null)\n"); + return; + } + + /* Loop over pattern commands. */ + while (p < pend) { + printf("%d:\t", p - start); + + switch ((re_opcode_t) * p++) { + case no_op: + printf("/no_op"); + break; + + case exactn: + mcnt = *p++; + printf("/exactn/%d", mcnt); + do { + putchar('/'); + putchar(*p++); + } + while (--mcnt); + break; + + case start_memory: + mcnt = *p++; + printf("/start_memory/%d/%d", mcnt, *p++); + break; + + case stop_memory: + mcnt = *p++; + printf("/stop_memory/%d/%d", mcnt, *p++); + break; + + case duplicate: + printf("/duplicate/%d", *p++); + break; + + case anychar: + printf("/anychar"); + break; + + case charset: + case charset_not: + { + register int c, last = -100; + register int in_range = 0; + + printf("/charset [%s", + (re_opcode_t) * (p - 1) == charset_not ? "^" : ""); + + assert(p + *p < pend); + + for (c = 0; c < 256; c++) + if (c / 8 < *p && (p[1 + (c / 8)] & (1 << (c % 8)))) { + /* Are we starting a range? */ + if (last + 1 == c && !in_range) { + putchar('-'); + in_range = 1; + } + /* Have we broken a range? */ + else if (last + 1 != c && in_range) { + putchar(last); + in_range = 0; + } + + if (!in_range) + putchar(c); + + last = c; + } + + if (in_range) + putchar(last); + + putchar(']'); + + p += 1 + *p; + } + break; + + case begline: + printf("/begline"); + break; + + case endline: + printf("/endline"); + break; + + case on_failure_jump: + extract_number_and_incr(&mcnt, &p); + printf("/on_failure_jump to %d", p + mcnt - start); + break; + + case on_failure_keep_string_jump: + extract_number_and_incr(&mcnt, &p); + printf("/on_failure_keep_string_jump to %d", p + mcnt - start); + break; + + case dummy_failure_jump: + extract_number_and_incr(&mcnt, &p); + printf("/dummy_failure_jump to %d", p + mcnt - start); + break; + + case push_dummy_failure: + printf("/push_dummy_failure"); + break; + + case maybe_pop_jump: + extract_number_and_incr(&mcnt, &p); + printf("/maybe_pop_jump to %d", p + mcnt - start); + break; + + case pop_failure_jump: + extract_number_and_incr(&mcnt, &p); + printf("/pop_failure_jump to %d", p + mcnt - start); + break; + + case jump_past_alt: + extract_number_and_incr(&mcnt, &p); + printf("/jump_past_alt to %d", p + mcnt - start); + break; + + case jump: + extract_number_and_incr(&mcnt, &p); + printf("/jump to %d", p + mcnt - start); + break; + + case succeed_n: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/succeed_n to %d, %d times", p1 - start, mcnt2); + break; + + case jump_n: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/jump_n to %d, %d times", p1 - start, mcnt2); + break; + + case set_number_at: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/set_number_at location %d to %d", p1 - start, mcnt2); + break; + + case wordbound: + printf("/wordbound"); + break; + + case notwordbound: + printf("/notwordbound"); + break; + + case wordbeg: + printf("/wordbeg"); + break; + + case wordend: + printf("/wordend"); + +# ifdef emacs + case before_dot: + printf("/before_dot"); + break; + + case at_dot: + printf("/at_dot"); + break; + + case after_dot: + printf("/after_dot"); + break; + + case syntaxspec: + printf("/syntaxspec"); + mcnt = *p++; + printf("/%d", mcnt); + break; + + case notsyntaxspec: + printf("/notsyntaxspec"); + mcnt = *p++; + printf("/%d", mcnt); + break; +# endif /* emacs */ + + case wordchar: + printf("/wordchar"); + break; + + case notwordchar: + printf("/notwordchar"); + break; + + case begbuf: + printf("/begbuf"); + break; + + case endbuf: + printf("/endbuf"); + break; + + default: + printf("?%d", *(p - 1)); + } + + putchar('\n'); + } + + printf("%d:\tend of pattern.\n", p - start); +} + + +void print_compiled_pattern(bufp) +struct re_pattern_buffer *bufp; +{ + unsigned char *buffer = bufp->buffer; + + print_partial_compiled_pattern(buffer, buffer + bufp->used); + printf("%ld bytes used/%ld bytes allocated.\n", + bufp->used, bufp->allocated); + + if (bufp->fastmap_accurate && bufp->fastmap) { + printf("fastmap: "); + print_fastmap(bufp->fastmap); + } + + printf("re_nsub: %d\t", bufp->re_nsub); + printf("regs_alloc: %d\t", bufp->regs_allocated); + printf("can_be_null: %d\t", bufp->can_be_null); + printf("newline_anchor: %d\n", bufp->newline_anchor); + printf("no_sub: %d\t", bufp->no_sub); + printf("not_bol: %d\t", bufp->not_bol); + printf("not_eol: %d\t", bufp->not_eol); + printf("syntax: %lx\n", bufp->syntax); + /* Perhaps we should print the translate table? */ +} + + +void print_double_string(where, string1, size1, string2, size2) +const char *where; +const char *string1; +const char *string2; +int size1; +int size2; +{ + int this_char; + + if (where == NULL) + printf("(null)"); + else { + if (FIRST_STRING_P(where)) { + for (this_char = where - string1; this_char < size1; + this_char++) + putchar(string1[this_char]); + + where = string2; + } + + for (this_char = where - string2; this_char < size2; this_char++) + putchar(string2[this_char]); + } +} + +void printchar(c) +int c; +{ + putc(c, stderr); +} + +#else /* not DEBUG */ + +# undef assert +# define assert(e) + +# define DEBUG_STATEMENT(e) +# define DEBUG_PRINT1(x) +# define DEBUG_PRINT2(x1, x2) +# define DEBUG_PRINT3(x1, x2, x3) +# define DEBUG_PRINT4(x1, x2, x3, x4) +# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) +# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) + +#endif /* not DEBUG */ + +/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ +/* This has no initializer because initialized variables in Emacs + become read-only after dumping. */ +reg_syntax_t re_syntax_options; + + +/* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. + + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + +reg_syntax_t re_set_syntax(syntax) +reg_syntax_t syntax; +{ + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; +#ifdef DEBUG + if (syntax & RE_DEBUG) + debug = 1; + else if (debug) /* was on but now is not */ + debug = 0; +#endif /* DEBUG */ + return ret; +} + +#ifdef _LIBC +weak_alias(__re_set_syntax, re_set_syntax) +#endif +/* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. + POSIX doesn't require that we do anything for REG_NOERROR, + but why not be nice? */ +static const char re_error_msgid[] = { +#define REG_NOERROR_IDX 0 + gettext_noop("Success") /* REG_NOERROR */ + "\0" +#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") + gettext_noop("No match") /* REG_NOMATCH */ + "\0" +#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") + gettext_noop("Invalid regular expression") /* REG_BADPAT */ + "\0" +#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") + gettext_noop("Invalid collation character") /* REG_ECOLLATE */ + "\0" +#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") + gettext_noop("Invalid character class name") /* REG_ECTYPE */ + "\0" +#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") + gettext_noop("Trailing backslash") /* REG_EESCAPE */ + "\0" +#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") + gettext_noop("Invalid back reference") /* REG_ESUBREG */ + "\0" +#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") + gettext_noop("Unmatched [ or [^") /* REG_EBRACK */ + "\0" +#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") + gettext_noop("Unmatched ( or \\(") /* REG_EPAREN */ + "\0" +#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") + gettext_noop("Unmatched \\{") /* REG_EBRACE */ + "\0" +#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") + gettext_noop("Invalid content of \\{\\}") /* REG_BADBR */ + "\0" +#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") + gettext_noop("Invalid range end") /* REG_ERANGE */ + "\0" +#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") + gettext_noop("Memory exhausted") /* REG_ESPACE */ + "\0" +#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") + gettext_noop("Invalid preceding regular expression") /* REG_BADRPT */ + "\0" +#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") + gettext_noop("Premature end of regular expression") /* REG_EEND */ + "\0" +#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") + gettext_noop("Regular expression too big") /* REG_ESIZE */ + "\0" +#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") + gettext_noop("Unmatched ) or \\)") /* REG_ERPAREN */ +}; + +static const size_t re_error_msgid_idx[] = { + REG_NOERROR_IDX, + REG_NOMATCH_IDX, + REG_BADPAT_IDX, + REG_ECOLLATE_IDX, + REG_ECTYPE_IDX, + REG_EESCAPE_IDX, + REG_ESUBREG_IDX, + REG_EBRACK_IDX, + REG_EPAREN_IDX, + REG_EBRACE_IDX, + REG_BADBR_IDX, + REG_ERANGE_IDX, + REG_ESPACE_IDX, + REG_BADRPT_IDX, + REG_EEND_IDX, + REG_ESIZE_IDX, + REG_ERPAREN_IDX +}; + +/* Avoiding alloca during matching, to placate r_alloc. */ + +/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the + searching and matching functions should not call alloca. On some + systems, alloca is implemented in terms of malloc, and if we're + using the relocating allocator routines, then malloc could cause a + relocation, which might (if the strings being searched are in the + ralloc heap) shift the data out from underneath the regexp + routines. + + Here's another reason to avoid allocation: Emacs + processes input from X in a signal handler; processing X input may + call malloc; if input arrives while a matching routine is calling + malloc, then we're scrod. But Emacs can't just block input while + calling matching routines; then we don't notice interrupts when + they come in. So, Emacs blocks input around all regexp calls + except the matching calls, which it leaves unprotected, in the + faith that they will not malloc. */ + +/* Normally, this is fine. */ +#define MATCH_MAY_ALLOCATE + +/* When using GNU C, we are not REALLY using the C alloca, no matter + what config.h may say. So don't take precautions for it. */ +#ifdef __GNUC__ +# undef C_ALLOCA +#endif + +/* The match routines may not allocate if (1) they would do it with malloc + and (2) it's not safe for them to use malloc. + Note that if REL_ALLOC is defined, matching would not use malloc for the + failure stack, b |