A smaller, kinder, gentler regexp implementation.

author: Eric Andersen <andersen@codepoet.org> 2000-10-20 03:48:11 +0000
committer: Eric Andersen <andersen@codepoet.org> 2000-10-20 03:48:11 +0000
commit: 82d766043c6a8dcf6283788419f110dd7ab52f80 (patch)
tree: 09505131008d1b4d2178065878c3e8e0d54c26a2 /libc/misc
parent: 5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff)
3 files changed, 5726 insertions, 7274 deletions
diff --git a/libc/misc/regex/Makefile b/libc/misc/regex/Makefile
index c4c13f6cf..38b7e98bf 100644
--- a/libc/misc/regex/Makefile
+++ b/libc/misc/regex/Makefile
@@ -24,7 +24,7 @@ TOPDIR=../../
 include $(TOPDIR)Rules.mak
 LIBC=$(TOPDIR)libc.a
 
-CSRC=rx.c
+CSRC=regex.c
 COBJS=$(patsubst %.c,%.o, $(CSRC))
 OBJS=$(COBJS)
 
diff --git a/libc/misc/regex/regex.c b/libc/misc/regex/regex.c
new file mode 100644
index 000000000..64e754ee0
--- /dev/null
+++ b/libc/misc/regex/regex.c
@@ -0,0 +1,5725 @@
+/* Extended regular expression matching and search library,
+   version 0.12.
+   (Implements POSIX draft P1003.2/D11.2, except for some of the
+   internationalization features.)
+   Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
+
+/* AIX requires this to be the first thing in the file. */
+#if defined _AIX && !defined REGEX_MALLOC
+#pragma alloca
+#endif
+
+#undef	_GNU_SOURCE
+#define _GNU_SOURCE
+#define STDC_HEADERS
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#ifndef PARAMS
+# if defined __GNUC__ || (defined __STDC__ && __STDC__)
+#  define PARAMS(args) args
+# else
+#  define PARAMS(args) ()
+# endif							/* GCC.  */
+#endif							/* Not PARAMS.  */
+
+#if defined STDC_HEADERS && !defined emacs
+# include <stddef.h>
+#else
+/* We need this for `regex.h', and perhaps for the Emacs include files.  */
+# include <sys/types.h>
+#endif
+
+#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
+
+/* For platforms which support the ISO C Amendment 1 functionality, we
+   support user-defined character classes.  */
+#if defined _LIBC || WIDE_CHAR_SUPPORT
+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
+#ifdef _LIBC
+/* We have to keep the namespace clean.  */
+# define regfree(preg) __regfree (preg)
+# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
+# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
+# define regerror(errcode, preg, errbuf, errbuf_size) \
+	__regerror(errcode, preg, errbuf, errbuf_size)
+# define re_set_registers(bu, re, nu, st, en) \
+	__re_set_registers (bu, re, nu, st, en)
+# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
+	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
+# define re_match(bufp, string, size, pos, regs) \
+	__re_match (bufp, string, size, pos, regs)
+# define re_search(bufp, string, size, startpos, range, regs) \
+	__re_search (bufp, string, size, startpos, range, regs)
+# define re_compile_pattern(pattern, length, bufp) \
+	__re_compile_pattern (pattern, length, bufp)
+# define re_set_syntax(syntax) __re_set_syntax (syntax)
+# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
+	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
+# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
+
+#define btowc __btowc
+#endif
+
+/* This is for other GNU distributions with internationalized messages.  */
+#if HAVE_LIBINTL_H || defined _LIBC
+# include <libintl.h>
+#else
+# define gettext(msgid) (msgid)
+#endif
+
+#ifndef gettext_noop
+/* This define is so xgettext can find the internationalizable
+   strings.  */
+# define gettext_noop(String) String
+#endif
+
+/* The `emacs' switch turns on certain matching commands
+   that make sense only in Emacs. */
+#ifdef emacs
+
+# include "lisp.h"
+# include "buffer.h"
+# include "syntax.h"
+
+#else							/* not emacs */
+
+/* If we are not linking with Emacs proper,
+   we can't use the relocating allocator
+   even if config.h says that we can.  */
+# undef REL_ALLOC
+
+# if defined STDC_HEADERS || defined _LIBC
+#  include <stdlib.h>
+# else
+char *malloc();
+char *realloc();
+# endif
+
+/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
+   If nothing else has been done, use the method below.  */
+# ifdef INHIBIT_STRING_HEADER
+#  if !(defined HAVE_BZERO && defined HAVE_BCOPY)
+#   if !defined bzero && !defined bcopy
+#    undef INHIBIT_STRING_HEADER
+#   endif
+#  endif
+# endif
+
+/* This is the normal way of making sure we have a bcopy and a bzero.
+   This is used in most programs--a few other programs avoid this
+   by defining INHIBIT_STRING_HEADER.  */
+# ifndef INHIBIT_STRING_HEADER
+#  if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
+#   include <string.h>
+#   ifndef bzero
+#    ifndef _LIBC
+#     define bzero(s, n)	(memset (s, '\0', n), (s))
+#    else
+#     define bzero(s, n)	__bzero (s, n)
+#    endif
+#   endif
+#  else
+#   include <strings.h>
+#   ifndef memcmp
+#    define memcmp(s1, s2, n)	bcmp (s1, s2, n)
+#   endif
+#   ifndef memcpy
+#    define memcpy(d, s, n)	(bcopy (s, d, n), (d))
+#   endif
+#  endif
+# endif
+
+/* Define the syntax stuff for \<, \>, etc.  */
+
+/* This must be nonzero for the wordchar and notwordchar pattern
+   commands in re_match_2.  */
+# ifndef Sword
+#  define Sword 1
+# endif
+
+# ifdef SWITCH_ENUM_BUG
+#  define SWITCH_ENUM_CAST(x) ((int)(x))
+# else
+#  define SWITCH_ENUM_CAST(x) (x)
+# endif
+
+#endif							/* not emacs */
+
+/* Get the interface, including the syntax bits.  */
+#include <regex.h>
+
+/* isalpha etc. are used for the character classes.  */
+#include <ctype.h>
+
+/* Jim Meyering writes:
+
+   "... Some ctype macros are valid only for character codes that
+   isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
+   using /bin/cc or gcc but without giving an ansi option).  So, all
+   ctype uses should be through macros like ISPRINT...  If
+   STDC_HEADERS is defined, then autoconf has verified that the ctype
+   macros don't need to be guarded with references to isascii. ...
+   Defining isascii to 1 should let any compiler worth its salt
+   eliminate the && through constant folding."
+   Solaris defines some of these symbols so we must undefine them first.  */
+
+#undef ISASCII
+#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
+# define ISASCII(c) 1
+#else
+# define ISASCII(c) isascii(c)
+#endif
+
+#ifdef isblank
+# define ISBLANK(c) (ISASCII (c) && isblank (c))
+#else
+# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
+#endif
+#ifdef isgraph
+# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
+#else
+# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
+#endif
+
+#undef ISPRINT
+#define ISPRINT(c) (ISASCII (c) && isprint (c))
+#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
+#define ISALNUM(c) (ISASCII (c) && isalnum (c))
+#define ISALPHA(c) (ISASCII (c) && isalpha (c))
+#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
+#define ISLOWER(c) (ISASCII (c) && islower (c))
+#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
+#define ISSPACE(c) (ISASCII (c) && isspace (c))
+#define ISUPPER(c) (ISASCII (c) && isupper (c))
+#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
+
+#ifdef _tolower
+# define TOLOWER(c) _tolower(c)
+#else
+# define TOLOWER(c) tolower(c)
+#endif
+
+#ifndef NULL
+# define NULL (void *)0
+#endif
+
+/* We remove any previous definition of `SIGN_EXTEND_CHAR',
+   since ours (we hope) works properly with all combinations of
+   machines, compilers, `char' and `unsigned char' argument types.
+   (Per Bothner suggested the basic approach.)  */
+#undef SIGN_EXTEND_CHAR
+#if __STDC__
+# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
+#else							/* not __STDC__ */
+/* As in Harbison and Steele.  */
+# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
+#endif
+
+#ifndef emacs
+/* How many characters in the character set.  */
+# define CHAR_SET_SIZE 256
+
+# ifdef SYNTAX_TABLE
+
+extern char *re_syntax_table;
+
+# else							/* not SYNTAX_TABLE */
+
+static char re_syntax_table[CHAR_SET_SIZE];
+
+/* Populate re_syntax_table on first use: every alphanumeric character
+   code and the underscore are tagged as Sword; all other entries stay
+   zero.  Calls after the first one return without doing anything.  */
+static void init_syntax_once()
+{
+	static int initialized = 0;
+	register int ch;
+
+	if (!initialized) {
+		bzero(re_syntax_table, sizeof re_syntax_table);
+
+		ch = 0;
+		while (ch < CHAR_SET_SIZE) {
+			if (ISALNUM(ch))
+				re_syntax_table[ch] = Sword;
+			++ch;
+		}
+
+		re_syntax_table['_'] = Sword;
+
+		initialized = 1;
+	}
+}
+
+# endif							/* not SYNTAX_TABLE */
+
+# define SYNTAX(c) re_syntax_table[((c) & 0xFF)]
+
+#endif							/* emacs */
+
+/* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
+   use `alloca' instead of `malloc'.  This is because using malloc in
+   re_search* or re_match* could cause memory leaks when C-g is used in
+   Emacs; also, malloc is slower and causes storage fragmentation.  On
+   the other hand, malloc is more portable, and easier to debug.
+
+   Because we sometimes use alloca, some routines have to be macros,
+   not functions -- `alloca'-allocated space disappears at the end of the
+   function it is called in.  */
+
+#ifdef REGEX_MALLOC
+
+# define REGEX_ALLOCATE malloc
+# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
+# define REGEX_FREE free
+
+#else							/* not REGEX_MALLOC  */
+
+/* Emacs already defines alloca, sometimes.  */
+# ifndef alloca
+
+/* Make alloca work the best possible way.  */
+#  ifdef __GNUC__
+#   define alloca __builtin_alloca
+#  else							/* not __GNUC__ */
+#   if HAVE_ALLOCA_H
+#    include <alloca.h>
+#   endif						/* HAVE_ALLOCA_H */
+#  endif						/* not __GNUC__ */
+
+# endif							/* not alloca */
+
+# define REGEX_ALLOCATE alloca
+
+/* Assumes a `char *destination' variable.  */
+# define REGEX_REALLOCATE(source, osize, nsize)				\
+  (destination = (char *) alloca (nsize),				\
+   memcpy (destination, source, osize))
+
+/* No need to do anything to free, after alloca.  */
+# define REGEX_FREE(arg) ((void)0)	/* Do nothing!  But inhibit gcc warning.  */
+
+#endif							/* not REGEX_MALLOC */
+
+/* Define how to allocate the failure stack.  */
+
+#if defined REL_ALLOC && defined REGEX_MALLOC
+
+# define REGEX_ALLOCATE_STACK(size)				\
+  r_alloc (&failure_stack_ptr, (size))
+# define REGEX_REALLOCATE_STACK(source, osize, nsize)		\
+  r_re_alloc (&failure_stack_ptr, (nsize))
+# define REGEX_FREE_STACK(ptr)					\
+  r_alloc_free (&failure_stack_ptr)
+
+#else							/* not using relocating allocator */
+
+# ifdef REGEX_MALLOC
+
+#  define REGEX_ALLOCATE_STACK malloc
+#  define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
+#  define REGEX_FREE_STACK free
+
+# else							/* not REGEX_MALLOC */
+
+#  define REGEX_ALLOCATE_STACK alloca
+
+#  define REGEX_REALLOCATE_STACK(source, osize, nsize)			\
+   REGEX_REALLOCATE (source, osize, nsize)
+/* No need to explicitly free anything.  */
+#  define REGEX_FREE_STACK(arg)
+
+# endif							/* not REGEX_MALLOC */
+#endif							/* not using relocating allocator */
+
+
+/* True if `size1' is non-NULL and PTR is pointing anywhere inside
+   `string1' or just past its end.  This works if PTR is NULL, which is
+   a good thing.  */
+#define FIRST_STRING_P(ptr) 					\
+  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
+
+/* (Re)Allocate N items of type T using malloc, or fail.  */
+#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
+#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
+#define RETALLOC_IF(addr, n, t) \
+  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
+#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
+
+#define BYTEWIDTH 8				/* In bits.  */
+
+#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
+
+#undef MAX
+#undef MIN
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+typedef char boolean;
+
+#define false 0
+#define true 1
+
+static int re_match_2_internal PARAMS((struct re_pattern_buffer * bufp,
+									   const char *string1, int size1,
+									   const char *string2, int size2,
+									   int pos,
+									   struct re_registers * regs,
+
+									   int stop));
+
+/* These are the command codes that appear in compiled regular
+   expressions.  Some opcodes are followed by argument bytes.  A
+   command code can specify any interpretation whatsoever for its
+   arguments.  Zero bytes may appear in the compiled regular expression.  */
+
+typedef enum {
+	no_op = 0,
+
+	/* Succeed right away--no more backtracking.  */
+	succeed,
+
+	/* Followed by one byte giving n, then by n literal bytes.  */
+	exactn,
+
+	/* Matches any (more or less) character.  */
+	anychar,
+
+	/* Matches any one char belonging to specified set.  First
+	   following byte is number of bitmap bytes.  Then come bytes
+	   for a bitmap saying which chars are in.  Bits in each byte
+	   are ordered low-bit-first.  A character is in the set if its
+	   bit is 1.  A character too large to have a bit in the map is
+	   automatically not in the set.  */
+	charset,
+
+	/* Same parameters as charset, but match any character that is
+	   not one of those specified.  */
+	charset_not,
+
+	/* Start remembering the text that is matched, for storing in a
+	   register.  Followed by one byte with the register number, in
+	   the range 0 to one less than the pattern buffer's re_nsub
+	   field.  Then followed by one byte with the number of groups
+	   inner to this one.  (This last has to be part of the
+	   start_memory only because we need it in the on_failure_jump
+	   of re_match_2.)  */
+	start_memory,
+
+	/* Stop remembering the text that is matched and store it in a
+	   memory register.  Followed by one byte with the register
+	   number, in the range 0 to one less than `re_nsub' in the
+	   pattern buffer, and one byte with the number of inner groups,
+	   just like `start_memory'.  (We need the number of inner
+	   groups here because we don't have any easy way of finding the
+	   corresponding start_memory when we're at a stop_memory.)  */
+	stop_memory,
+
+	/* Match a duplicate of something remembered. Followed by one
+	   byte containing the register number.  */
+	duplicate,
+
+	/* Fail unless at beginning of line.  */
+	begline,
+
+	/* Fail unless at end of line.  */
+	endline,
+
+	/* Succeeds if at beginning of buffer (if emacs) or at beginning
+	   of string to be matched (if not).  */
+	begbuf,
+
+	/* Analogously, for end of buffer/string.  */
+	endbuf,
+
+	/* Followed by two byte relative address to which to jump.  */
+	jump,
+
+	/* Same as jump, but marks the end of an alternative.  */
+	jump_past_alt,
+
+	/* Followed by two-byte relative address of place to resume at
+	   in case of failure.  */
+	on_failure_jump,
+
+	/* Like on_failure_jump, but pushes a placeholder instead of the
+	   current string position when executed.  */
+	on_failure_keep_string_jump,
+
+	/* Throw away latest failure point and then jump to following
+	   two-byte relative address.  */
+	pop_failure_jump,
+
+	/* Change to pop_failure_jump if know won't have to backtrack to
+	   match; otherwise change to jump.  This is used to jump
+	   back to the beginning of a repeat.  If what follows this jump
+	   clearly won't match what the repeat does, such that we can be
+	   sure that there is no use backtracking out of repetitions
+	   already matched, then we change it to a pop_failure_jump.
+	   Followed by two-byte address.  */
+	maybe_pop_jump,
+
+	/* Jump to following two-byte address, and push a dummy failure
+	   point. This failure point will be thrown away if an attempt
+	   is made to use it for a failure.  A `+' construct makes this
+	   before the first repeat.  Also used as an intermediary kind
+	   of jump when compiling an alternative.  */
+	dummy_failure_jump,
+
+	/* Push a dummy failure point and continue.  Used at the end of
+	   alternatives.  */
+	push_dummy_failure,
+
+	/* Followed by two-byte relative address and two-byte number n.
+	   After matching N times, jump to the address upon failure.  */
+	succeed_n,
+
+	/* Followed by two-byte relative address, and two-byte number n.
+	   Jump to the address N times, then fail.  */
+	jump_n,
+
+	/* Set the following two-byte relative address to the
+	   subsequent two-byte number.  The address *includes* the two
+	   bytes of number.  */
+	set_number_at,
+
+	wordchar,					/* Matches any word-constituent character.  */
+	notwordchar,				/* Matches any char that is not a word-constituent.  */
+
+	wordbeg,					/* Succeeds if at word beginning.  */
+	wordend,					/* Succeeds if at word end.  */
+
+	wordbound,					/* Succeeds if at a word boundary.  */
+	notwordbound				/* Succeeds if not at a word boundary.  */
+#ifdef emacs
+		, before_dot,			/* Succeeds if before point.  */
+	at_dot,						/* Succeeds if at point.  */
+	after_dot,					/* Succeeds if after point.  */
+
+	/* Matches any character whose syntax is specified.  Followed by
+	   a byte which contains a syntax code, e.g., Sword.  */
+	syntaxspec,
+
+	/* Matches any character whose syntax is not that specified.  */
+	notsyntaxspec
+#endif							/* emacs */
+} re_opcode_t;
+
+/* Common operations on the compiled pattern.  */
+
+/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */
+
+#define STORE_NUMBER(destination, number)				\
+  do {									\
+    (destination)[0] = (number) & 0377;					\
+    (destination)[1] = (number) >> 8;					\
+  } while (0)
+
+/* Same as STORE_NUMBER, except increment DESTINATION to
+   the byte after where the number is stored.  Therefore, DESTINATION
+   must be an lvalue.  */
+
+#define STORE_NUMBER_AND_INCR(destination, number)			\
+  do {									\
+    STORE_NUMBER (destination, number);					\
+    (destination) += 2;							\
+  } while (0)
+
+/* Put into DESTINATION a number stored in two contiguous bytes starting
+   at SOURCE: the low-order byte comes first, followed by a high-order
+   byte that is sign-extended.  The high byte is scaled by multiplying
+   by 256 rather than by `<< 8', because left-shifting a negative value
+   is undefined behavior in ISO C.  */
+
+#define EXTRACT_NUMBER(destination, source)				\
+  do {									\
+    (destination) = *(source) & 0377;					\
+    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
+  } while (0)
+
+#ifdef DEBUG
+/* Function form of EXTRACT_NUMBER, compiled only under DEBUG: store in
+   *DEST the number held in the two bytes at SOURCE (low-order byte
+   first, high-order byte sign-extended).  */
+static void extract_number _RE_ARGS((int *dest, unsigned char *source));
+static void extract_number(dest, source)
+int *dest;
+unsigned char *source;
+{
+	int high = SIGN_EXTEND_CHAR(*(source + 1));
+
+	/* Multiply instead of shifting: `high' may be negative, and
+	   left-shifting a negative value is undefined behavior.  */
+	*dest = (*source & 0377) + high * 256;
+}
+
+# ifndef EXTRACT_MACROS			/* To debug the macros.  */
+#  undef EXTRACT_NUMBER
+#  define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
+# endif							/* not EXTRACT_MACROS */
+
+#endif							/* DEBUG */
+
+/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
+   SOURCE must be an lvalue.  */
+
+#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
+  do {									\
+    EXTRACT_NUMBER (destination, source);				\
+    (source) += 2; 							\
+  } while (0)
+
+#ifdef DEBUG
+/* Function form of EXTRACT_NUMBER_AND_INCR, compiled only under DEBUG:
+   read the two-byte number at *SOURCE into *DESTINATION by way of
+   extract_number, then move *SOURCE past those two bytes.  */
+static void extract_number_and_incr _RE_ARGS((int *destination,
+											  unsigned char **source));
+static void extract_number_and_incr(destination, source)
+int *destination;
+unsigned char **source;
+{
+	unsigned char *number_start = *source;
+
+	extract_number(destination, number_start);
+	*source = number_start + 2;
+}
+
+# ifndef EXTRACT_MACROS
+#  undef EXTRACT_NUMBER_AND_INCR
+#  define EXTRACT_NUMBER_AND_INCR(dest, src) \
+  extract_number_and_incr (&dest, &src)
+# endif							/* not EXTRACT_MACROS */
+
+#endif							/* DEBUG */
+
+/* If DEBUG is defined, Regex prints many voluminous messages about what
+   it is doing (if the variable `debug' is nonzero).  If linked with the
+   main program in `iregex.c', you can enter patterns and strings
+   interactively.  And if linked with the main program in `main.c' and
+   the other test files, you can run the already-written tests.  */
+
+#ifdef DEBUG
+
+/* We use standard I/O for debugging.  */
+# include <stdio.h>
+
+/* It is useful to test things that ``must'' be true when debugging.  */
+# include <assert.h>
+
+static int debug;
+
+# define DEBUG_STATEMENT(e) e
+# define DEBUG_PRINT1(x) if (debug) printf (x)
+# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
+# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
+# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
+# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 				\
+  if (debug) print_partial_compiled_pattern (s, e)
+# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
+  if (debug) print_double_string (w, s1, sz1, s2, sz2)
+
+
+/* Print the fastmap in human-readable form: each maximal run of
+   consecutive nonzero entries is written as its first character,
+   followed by `-' and its last character when the run covers more than
+   one entry.  A newline ends the output.  */
+
+void print_fastmap(fastmap)
+char *fastmap;
+{
+	const unsigned limit = 1 << BYTEWIDTH;
+	unsigned pos = 0;
+
+	while (pos < limit) {
+		unsigned run_start;
+
+		if (!fastmap[pos]) {
+			pos++;
+			continue;
+		}
+
+		run_start = pos++;
+		putchar(run_start);
+
+		/* Swallow the remainder of the run.  */
+		while (pos < limit && fastmap[pos])
+			pos++;
+
+		if (pos - run_start > 1) {
+			printf("-");
+			putchar(pos - 1);
+		}
+	}
+	putchar('\n');
+}
+
+
+/* Print a compiled pattern string in human-readable form, starting at
+   the START pointer into it and ending just before the pointer END.
+   Each command goes on its own line, prefixed with its byte offset from
+   START; a null START prints "(null)".  Offsets are pointer
+   differences, so they are converted to long and printed with %ld.  */
+
+void print_partial_compiled_pattern(start, end)
+unsigned char *start;
+unsigned char *end;
+{
+	int mcnt, mcnt2;
+	unsigned char *p1;
+	unsigned char *p = start;
+	unsigned char *pend = end;
+
+	if (start == NULL) {
+		printf("(null)\n");
+		return;
+	}
+
+	/* Loop over pattern commands.  */
+	while (p < pend) {
+		printf("%ld:\t", (long) (p - start));
+
+		switch ((re_opcode_t) * p++) {
+		case no_op:
+			printf("/no_op");
+			break;
+
+		case exactn:
+			/* One count byte, then that many literal bytes.  */
+			mcnt = *p++;
+			printf("/exactn/%d", mcnt);
+			do {
+				putchar('/');
+				putchar(*p++);
+			}
+			while (--mcnt);
+			break;
+
+		case start_memory:
+			mcnt = *p++;
+			printf("/start_memory/%d/%d", mcnt, *p++);
+			break;
+
+		case stop_memory:
+			mcnt = *p++;
+			printf("/stop_memory/%d/%d", mcnt, *p++);
+			break;
+
+		case duplicate:
+			printf("/duplicate/%d", *p++);
+			break;
+
+		case anychar:
+			printf("/anychar");
+			break;
+
+		case charset:
+		case charset_not:
+		{
+			register int c, last = -100;
+			register int in_range = 0;
+
+			printf("/charset [%s",
+				   (re_opcode_t) * (p - 1) == charset_not ? "^" : "");
+
+			assert(p + *p < pend);
+
+			/* *p is the number of bitmap bytes; the bitmap itself
+			   starts at p[1], low bit first within each byte.  */
+			for (c = 0; c < 256; c++)
+				if (c / 8 < *p && (p[1 + (c / 8)] & (1 << (c % 8)))) {
+					/* Are we starting a range?  */
+					if (last + 1 == c && !in_range) {
+						putchar('-');
+						in_range = 1;
+					}
+					/* Have we broken a range?  */
+					else if (last + 1 != c && in_range) {
+						putchar(last);
+						in_range = 0;
+					}
+
+					if (!in_range)
+						putchar(c);
+
+					last = c;
+				}
+
+			if (in_range)
+				putchar(last);
+
+			putchar(']');
+
+			p += 1 + *p;
+		}
+			break;
+
+		case begline:
+			printf("/begline");
+			break;
+
+		case endline:
+			printf("/endline");
+			break;
+
+		case on_failure_jump:
+			extract_number_and_incr(&mcnt, &p);
+			printf("/on_failure_jump to %ld", (long) (p + mcnt - start));
+			break;
+
+		case on_failure_keep_string_jump:
+			extract_number_and_incr(&mcnt, &p);
+			printf("/on_failure_keep_string_jump to %ld",
+				   (long) (p + mcnt - start));
+			break;
+
+		case dummy_failure_jump:
+			extract_number_and_incr(&mcnt, &p);
+			printf("/dummy_failure_jump to %ld", (long) (p + mcnt - start));
+			break;
+
+		case push_dummy_failure:
+			printf("/push_dummy_failure");
+			break;
+
+		case maybe_pop_jump:
+			extract_number_and_incr(&mcnt, &p);
+			printf("/maybe_pop_jump to %ld", (long) (p + mcnt - start));
+			break;
+
+		case pop_failure_jump:
+			extract_number_and_incr(&mcnt, &p);
+			printf("/pop_failure_jump to %ld", (long) (p + mcnt - start));
+			break;
+
+		case jump_past_alt:
+			extract_number_and_incr(&mcnt, &p);
+			printf("/jump_past_alt to %ld", (long) (p + mcnt - start));
+			break;
+
+		case jump:
+			extract_number_and_incr(&mcnt, &p);
+			printf("/jump to %ld", (long) (p + mcnt - start));
+			break;
+
+		case succeed_n:
+			/* The jump target is relative to the byte after the
+			   address, so compute it before reading the count.  */
+			extract_number_and_incr(&mcnt, &p);
+			p1 = p + mcnt;
+			extract_number_and_incr(&mcnt2, &p);
+			printf("/succeed_n to %ld, %d times", (long) (p1 - start),
+				   mcnt2);
+			break;
+
+		case jump_n:
+			extract_number_and_incr(&mcnt, &p);
+			p1 = p + mcnt;
+			extract_number_and_incr(&mcnt2, &p);
+			printf("/jump_n to %ld, %d times", (long) (p1 - start), mcnt2);
+			break;
+
+		case set_number_at:
+			extract_number_and_incr(&mcnt, &p);
+			p1 = p + mcnt;
+			extract_number_and_incr(&mcnt2, &p);
+			printf("/set_number_at location %ld to %d",
+				   (long) (p1 - start), mcnt2);
+			break;
+
+		case wordbound:
+			printf("/wordbound");
+			break;
+
+		case notwordbound:
+			printf("/notwordbound");
+			break;
+
+		case wordbeg:
+			printf("/wordbeg");
+			break;
+
+		case wordend:
+			printf("/wordend");
+			/* Without this break, control fell into the next case and
+			   a second opcode name was printed on the same line.  */
+			break;
+
+# ifdef emacs
+		case before_dot:
+			printf("/before_dot");
+			break;
+
+		case at_dot:
+			printf("/at_dot");
+			break;
+
+		case after_dot:
+			printf("/after_dot");
+			break;
+
+		case syntaxspec:
+			printf("/syntaxspec");
+			mcnt = *p++;
+			printf("/%d", mcnt);
+			break;
+
+		case notsyntaxspec:
+			printf("/notsyntaxspec");
+			mcnt = *p++;
+			printf("/%d", mcnt);
+			break;
+# endif							/* emacs */
+
+		case wordchar:
+			printf("/wordchar");
+			break;
+
+		case notwordchar:
+			printf("/notwordchar");
+			break;
+
+		case begbuf:
+			printf("/begbuf");
+			break;
+
+		case endbuf:
+			printf("/endbuf");
+			break;
+
+		default:
+			/* Unknown opcode: show its numeric value.  */
+			printf("?%d", *(p - 1));
+		}
+
+		putchar('\n');
+	}
+
+	printf("%ld:\tend of pattern.\n", (long) (p - start));
+}
+
+
+/* Print the compiled pattern held in BUFP in human-readable form,
+   followed by its buffer usage, its fastmap (only when one is present
+   and marked accurate) and the remaining pattern-buffer fields.  */
+void print_compiled_pattern(bufp)
+struct re_pattern_buffer *bufp;
+{
+	unsigned char *buffer = bufp->buffer;
+
+	print_partial_compiled_pattern(buffer, buffer + bufp->used);
+
+	/* The explicit casts below keep every argument in step with its
+	   conversion specifier, whatever the exact field types declared
+	   in regex.h are.  */
+	printf("%lu bytes used/%lu bytes allocated.\n",
+		   (unsigned long) bufp->used, (unsigned long) bufp->allocated);
+
+	if (bufp->fastmap_accurate && bufp->fastmap) {
+		printf("fastmap: ");
+		print_fastmap(bufp->fastmap);
+	}
+
+	printf("re_nsub: %lu\t", (unsigned long) bufp->re_nsub);
+	printf("regs_alloc: %d\t", (int) bufp->regs_allocated);
+	printf("can_be_null: %d\t", (int) bufp->can_be_null);
+	printf("newline_anchor: %d\n", (int) bufp->newline_anchor);
+	printf("no_sub: %d\t", (int) bufp->no_sub);
+	printf("not_bol: %d\t", (int) bufp->not_bol);
+	printf("not_eol: %d\t", (int) bufp->not_eol);
+	printf("syntax: %lx\n", (unsigned long) bufp->syntax);
+	/* Perhaps we should print the translate table?  */
+}
+
+
+/* Print the text from WHERE to the end of the two-part string
+   STRING1/STRING2 (SIZE1 and SIZE2 characters long).  When WHERE lies
+   in STRING1, the rest of STRING1 is printed, followed by all of
+   STRING2; otherwise printing starts at WHERE's offset from STRING2.
+   A null WHERE prints "(null)".  */
+void print_double_string(where, string1, size1, string2, size2)
+const char *where;
+const char *string1;
+const char *string2;
+int size1;
+int size2;
+{
+	int idx;
+
+	if (where == NULL) {
+		printf("(null)");
+		return;
+	}
+
+	if (FIRST_STRING_P(where)) {
+		idx = where - string1;
+		while (idx < size1)
+			putchar(string1[idx++]);
+
+		/* Carry on from the very start of the second string.  */
+		where = string2;
+	}
+
+	idx = where - string2;
+	while (idx < size2)
+		putchar(string2[idx++]);
+}
+
+/* Write the single character C to the standard error stream.  */
+void printchar(c)
+int c;
+{
+	fputc(c, stderr);
+}
+
+#else							/* not DEBUG */
+
+# undef assert
+# define assert(e)
+
+# define DEBUG_STATEMENT(e)
+# define DEBUG_PRINT1(x)
+# define DEBUG_PRINT2(x1, x2)
+# define DEBUG_PRINT3(x1, x2, x3)
+# define DEBUG_PRINT4(x1, x2, x3, x4)
+# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
+# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
+
+#endif							/* not DEBUG */
+
+/* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
+   also be assigned to arbitrarily: each pattern buffer stores its own
+   syntax, so it can be changed between regex compilations.  */
+/* This has no initializer because initialized variables in Emacs
+   become read-only after dumping.  */
+reg_syntax_t re_syntax_options;
+
+
+/* Specify the precise syntax of regexps for compilation.  This provides
+   for compatibility for various utilities which historically have
+   different, incompatible syntaxes.
+
+   The argument SYNTAX is a bit mask comprised of the various bits
+   defined in regex.h.  We return the old syntax.  */
+
+reg_syntax_t re_set_syntax(syntax)
+reg_syntax_t syntax;
+{
+	/* Remember the outgoing value; it is what the caller gets back.  */
+	const reg_syntax_t previous = re_syntax_options;
+
+#ifdef DEBUG
+	/* Debug tracing is on exactly when the new syntax has RE_DEBUG.  */
+	debug = (syntax & RE_DEBUG) ? 1 : 0;
+#endif							/* DEBUG */
+	re_syntax_options = syntax;
+
+	return previous;
+}
+
+#ifdef _LIBC
+weak_alias(__re_set_syntax, re_set_syntax)
+#endif
+/* This table gives an error message for each of the error codes listed
+   in regex.h.  Obviously the order here has to be same as there.
+   POSIX doesn't require that we do anything for REG_NOERROR,
+   but why not be nice?  */
+static const char re_error_msgid[] = {
+#define REG_NOERROR_IDX	0
+	gettext_noop("Success")		/* REG_NOERROR */
+		"\0"
+#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
+		gettext_noop("No match")	/* REG_NOMATCH */
+		"\0"
+#define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
+		gettext_noop("Invalid regular expression")	/* REG_BADPAT */
+		"\0"
+#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
+		gettext_noop("Invalid collation character")	/* REG_ECOLLATE */
+		"\0"
+#define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
+		gettext_noop("Invalid character class name")	/* REG_ECTYPE */
+		"\0"
+#define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
+		gettext_noop("Trailing backslash")	/* REG_EESCAPE */
+		"\0"
+#define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
+		gettext_noop("Invalid back reference")	/* REG_ESUBREG */
+		"\0"
+#define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
+		gettext_noop("Unmatched [ or [^")	/* REG_EBRACK */
+		"\0"
+#define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
+		gettext_noop("Unmatched ( or \\(")	/* REG_EPAREN */
+		"\0"
+#define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
+		gettext_noop("Unmatched \\{")	/* REG_EBRACE */
+		"\0"
+#define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
+		gettext_noop("Invalid content of \\{\\}")	/* REG_BADBR */
+		"\0"
+#define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
+		gettext_noop("Invalid range end")	/* REG_ERANGE */
+		"\0"
+#define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
+		gettext_noop("Memory exhausted")	/* REG_ESPACE */
+		"\0"
+#define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
+		gettext_noop("Invalid preceding regular expression")	/* REG_BADRPT */
+		"\0"
+#define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
+		gettext_noop("Premature end of regular expression")	/* REG_EEND */
+		"\0"
+#define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
+		gettext_noop("Regular expression too big")	/* REG_ESIZE */
+		"\0"
+#define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
+		gettext_noop("Unmatched ) or \\)")	/* REG_ERPAREN */
+};
+
+static const size_t re_error_msgid_idx[] = {
+	REG_NOERROR_IDX,
+	REG_NOMATCH_IDX,
+	REG_BADPAT_IDX,
+	REG_ECOLLATE_IDX,
+	REG_ECTYPE_IDX,
+	REG_EESCAPE_IDX,
+	REG_ESUBREG_IDX,
+	REG_EBRACK_IDX,
+	REG_EPAREN_IDX,
+	REG_EBRACE_IDX,
+	REG_BADBR_IDX,
+	REG_ERANGE_IDX,
+	REG_ESPACE_IDX,
+	REG_BADRPT_IDX,
+	REG_EEND_IDX,
+	REG_ESIZE_IDX,
+	REG_ERPAREN_IDX
+};
+
+/* Avoiding alloca during matching, to placate r_alloc.  */
+
+/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
+   searching and matching functions should not call alloca.  On some
+   systems, alloca is implemented in terms of malloc, and if we're
+   using the relocating allocator routines, then malloc could cause a
+   relocation, which might (if the strings being searched are in the
+   ralloc heap) shift the data out from underneath the regexp
+   routines.
+
+   Here's another reason to avoid allocation: Emacs
+   processes input from X in a signal handler; processing X input may
+   call malloc; if input arrives while a matching routine is calling
+   malloc, then we're scrod.  But Emacs can't just block input while
+   calling matching routines; then we don't notice interrupts when
+   they come in.  So, Emacs blocks input around all regexp calls
+   except the matching calls, which it leaves unprotected, in the
+   faith that they will not malloc.  */
+
+/* Normally, this is fine.  */
+#define MATCH_MAY_ALLOCATE
+
+/* When using GNU C, we are not REALLY using the C alloca, no matter
+   what config.h may say.  So don't take precautions for it.  */
+#ifdef __GNUC__
+# undef C_ALLOCA
+#endif
+
+/* The match routines may not allocate if (1) they would do it with malloc
+   and (2) it's not safe for them to use malloc.
+   Note that if REL_ALLOC is defined, matching would not use malloc for the
+   failure stack, but we would still use it for the register vectors;
+   so REL_ALLOC should not affect this.  */
+#if (defined C_ALLOCA || defined REGEX_MALLOC) &&a
author	Eric Andersen <andersen@codepoet.org>	2000-10-20 03:48:11 +0000
committer	Eric Andersen <andersen@codepoet.org>	2000-10-20 03:48:11 +0000
commit	82d766043c6a8dcf6283788419f110dd7ab52f80 (patch)
tree	09505131008d1b4d2178065878c3e8e0d54c26a2 /libc/misc
parent	5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff)