A smaller, kinder, gentler regexp implementation.

author: Eric Andersen <andersen@codepoet.org> 2000-10-20 03:48:11 +0000
committer: Eric Andersen <andersen@codepoet.org> 2000-10-20 03:48:11 +0000
commit: 82d766043c6a8dcf6283788419f110dd7ab52f80 (patch)
tree: 09505131008d1b4d2178065878c3e8e0d54c26a2
parent: 5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff)
5 files changed, 6215 insertions, 10978 deletions
diff --git a/include/regex.h b/include/regex.h
index 64a8de685..113a32e72 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -1,1332 +1,62 @@
-#if !defined(_RX_H) || defined(RX_WANT_SE_DEFS)
-#define _RX_H
+/* Definitions for data structures and routines for the regular
+   expression library, version 0.12.
+   Copyright (C) 1985,1989-1993,1995-1998,2000 Free Software Foundation, Inc.
 
-/*	Copyright (C) 1992, 1993 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.  Its master source is NOT part of
+   the C library, however.  The master source lives in /gd/gnu/lib.
 
-This file is part of the librx library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
 
-Librx is free software; you can redistribute it and/or modify it under
-the terms of the GNU Library General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
 
-Librx is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-for more details.
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
 
-You should have received a copy of the GNU Library General Public
-License along with this software; see the file COPYING.LIB.  If not,
-write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
-02139, USA.  */
-/*  t. lord	Wed Sep 23 18:20:57 1992	*/
+#ifndef _REGEX_H
+#define _REGEX_H 1
 
-
-
-
-#include <features.h>
-
-#define __need_size_t
-#include <stddef.h>
-
-#include <string.h>
-
-#if RX_WANT_SE_DEFS != 1
-__BEGIN_DECLS
-#endif
-
-#ifndef RX_WANT_SE_DEFS
-
-/* This page: Bitsets */
-
-#ifndef RX_subset
-typedef unsigned int RX_subset;
-#define RX_subset_bits	(32)
-#define RX_subset_mask	(RX_subset_bits - 1)
-#endif
-
-typedef RX_subset * rx_Bitset;
-
-#ifdef __STDC__
-typedef void (*rx_bitset_iterator) (void *, int member_index);
-#else
-typedef void (*rx_bitset_iterator) ();
-#endif
-
-#define rx_bitset_subset(N)  ((N) / RX_subset_bits)
-#define rx_bitset_subset_val(B,N)  ((B)[rx_bitset_subset(N)])
-#define RX_bitset_access(B,N,OP) \
-  ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
-#define RX_bitset_member(B,N)   RX_bitset_access(B, N, &)
-#define RX_bitset_enjoin(B,N)   RX_bitset_access(B, N, |=)
-#define RX_bitset_remove(B,N)   RX_bitset_access(B, N, &= ~)
-#define RX_bitset_toggle(B,N)   RX_bitset_access(B, N, ^= )
-#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits)
-#define rx_sizeof_bitset(N)	(rx_bitset_numb_subsets(N) * sizeof(RX_subset))
-
-
-
-/* This page: Splay trees. */
-
-#ifdef __STDC__
-typedef int (*rx_sp_comparer) (void * a, void * b);
-#else
-typedef int (*rx_sp_comparer) ();
+/* Allow the use in C++ code.  */
+#ifdef __cplusplus
+extern "C" {
 #endif
 
-struct rx_sp_node 
-{
-  void * key;
-  void * data;
-  struct rx_sp_node * kids[2];
-};
+/* POSIX says that <sys/types.h> must be included (by the caller) before
+   <regex.h>.  */
 
-#ifdef __STDC__
-typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
-#else
-typedef void (*rx_sp_key_data_freer) ();
+#if !defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE && defined VMS
+/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it
+   should be there.  */
+# include <stddef.h>
 #endif
 
-
-/* giant inflatable hash trees */
-
-struct rx_hash_item
-{
-  struct rx_hash_item * next_same_hash;
-  struct rx_hash * table;
-  unsigned long hash;
-  void * data;
-  void * binding;
-};
-
-struct rx_hash
-{
-  struct rx_hash * parent;
-  int refs;
-  struct rx_hash * children[13];
-  struct rx_hash_item * buckets [13];
-  int bucket_size [13];
-};
-
-struct rx_hash_rules;
-
-#ifdef __STDC__
-/* should return like == */
-typedef int (*rx_hash_eq)(void *, void *);
-typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
-typedef void (*rx_free_hash)(struct rx_hash *,
-			    struct rx_hash_rules *);
-typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
-						    void *);
-typedef void (*rx_free_hash_item)(struct rx_hash_item *,
-				 struct rx_hash_rules *);
-#else
-typedef int (*rx_hash_eq)();
-typedef struct rx_hash * (*rx_alloc_hash)();
-typedef void (*rx_free_hash)();
-typedef struct rx_hash_item * (*rx_alloc_hash_item)();
-typedef void (*rx_free_hash_item)();
-#endif
-
-struct rx_hash_rules
-{
-  rx_hash_eq eq;
-  rx_alloc_hash hash_alloc;
-  rx_free_hash free_hash;
-  rx_alloc_hash_item hash_item_alloc;
-  rx_free_hash_item free_hash_item;
-};
-
-
-/* Forward declarations */
-
-struct rx_cache;
-struct rx_superset;
-struct rx;
-struct rx_se_list;
-
-
-
-/* 
- * GLOSSARY
- *
- * regexp
- * regular expression
- * expression
- * pattern - a `regular' expression.  The expression
- *       need not be formally regular -- it can contain
- *       constructs that don't correspond to purely regular
- *       expressions.
- *
- * buffer
- * string - the string (or strings) being searched or matched.
- *
- * pattern buffer - a structure of type `struct re_pattern_buffer'
- *       This in turn contains a `struct rx', which holds the
- *       NFA compiled from a pattern, as well as some of the state
- *       of a matcher using the pattern.
- *
- * NFA - nondeterministic finite automata.  Some people
- *       use this term to a member of the class of 
- *       regular automata (those corresponding to a regular
- *       language).  However, in this code, the meaning is
- *       more general.  The automata used by Rx are comperable
- *       in power to what are usually called `push down automata'.
- *
- *       Two NFA are built by rx for every pattern.  One is built
- *       by the compiler.  The other is built from the first, on
- *       the fly, by the matcher.  The latter is called the `superstate
- *       NFA' because its states correspond to sets of states from
- *       the first NFA.  (Joe Keane gets credit for the name
- *       `superstate NFA').
- *
- * NFA edges
- * epsilon edges
- * side-effect edges - The NFA compiled from a pattern can have three
- *       kinds of edges.  Epsilon edges can be taken freely anytime
- *       their source state is reached.  Character set edges can be
- *       taken when their source state is reached and when the next 
- *       character in the buffer is a member of the set.  Side effect
- *       edges imply a transition that can only be taken after the
- *       indicated side effect has been successfully accomplished.
- *       Some examples of side effects are:
- *
- *		Storing the current match position to record the
- *              location of a parentesized subexpression.
- *
- *              Advancing the matcher over N characters if they
- *              match the N characters previously matched by a 
- *              parentesized subexpression.
- *
- *       Both of those kinds of edges occur in the NFA generated
- *       by the pattern:  \(.\)\1
- *
- *       Epsilon and side effect edges are similar.  Unfortunately,
- *       some of the code uses the name `epsilon edge' to mean
- *       both epsilon and side effect edges.  For example,  the
- *       function has_non_idempotent_epsilon_path computes the existance
- *       of a non-trivial path containing only a mix of epsilon and
- *       side effect edges.  In that case `nonidempotent epsilon' is being
- *       used to mean `side effect'.
- */
-
-
-
-
-
-/* LOW LEVEL PATTERN BUFFERS */
-
-/* Suppose that from some NFA state, more than one path through
- * side-effect edges is possible.  In what order should the paths
- * be tried?  A function of type rx_se_list_order answers that
- * question.  It compares two lists of side effects, and says
- * which list comes first.
- */
- 
-#ifdef __STDC__
-typedef int (*rx_se_list_order) (struct rx *,
-				 struct rx_se_list *, 
-				 struct rx_se_list *);
-#else
-typedef int (*rx_se_list_order) ();
-#endif
-
-
-
-/* Struct RX holds a compiled regular expression - that is, an nfa
- * ready to be converted on demand to a more efficient superstate nfa.
- * This is for the low level interface.  The high-level interfaces enclose
- * this in a `struct re_pattern_buffer'.  
- */
-struct rx
-{
-  /* The compiler assigns a unique id to every pattern.
-   * Like sequence numbers in X, there is a subtle bug here
-   * if you use Rx in a system that runs for a long time.
-   * But, because of the way the caches work out, it is almost
-   * impossible to trigger the Rx version of this bug.
-   *
-   * The id is used to validate superstates found in a cache
-   * of superstates.  It isn't sufficient to let a superstate
-   * point back to the rx for which it was compiled -- the caller
-   * may be re-using a `struct rx' in which case the superstate
-   * is not really valid.  So instead, superstates are validated
-   * by checking the sequence number of the pattern for which
-   * they were built.
-   */
-  int rx_id;
-
-  /* This is memory mgt. state for superstates.  This may be 
-   * shared by more than one struct rx.
-   */
-  struct rx_cache * cache;
-
-  /* Every regex defines the size of its own character set. 
-   * A superstate has an array of this size, with each element
-   * a `struct rx_inx'.  So, don't make this number too large.
-   * In particular, don't make it 2^16.
-   */
-  int local_cset_size;
-
-  /* After the NFA is built, it is copied into a contiguous region
-   * of memory (mostly for compatability with GNU regex).
-   * Here is that region, and it's size:
-   */
-  void * buffer;
-  unsigned long allocated;
-
-  /* Clients of RX can ask for some extra storage in the space pointed
-   * to by BUFFER.  The field RESERVED is an input parameter to the
-   * compiler.  After compilation, this much space will be available 
-   * at (buffer + allocated - reserved)
-   */
-  unsigned long reserved;
-
-  /* --------- The remaining fields are for internal use only. --------- */
-  /* --------- But! they must be initialized to 0.	       --------- */
-
-  /* NODEC is the number of nodes in the NFA with non-epsilon
-   * transitions. 
-   */
-  int nodec;
-
-  /* EPSNODEC is the number of nodes with only epsilon transitions. */
-  int epsnodec;
-
-  /* The sum (NODEC + EPSNODEC) is the total number of states in the
-   * compiled NFA.
-   */
-
-  /* Lists of side effects as stored in the NFA are `hash consed'..meaning
-   * that lists with the same elements are ==.  During compilation, 
-   * this table facilitates hash-consing.
-   */
-  struct rx_hash se_list_memo;
-
-  /* Lists of NFA states are also hashed. 
-   */
-  struct rx_hash set_list_memo;
-
-
-
-
-  /* The compiler and matcher must build a number of instruction frames.
-   * The format of these frames is fixed (c.f. struct rx_inx).  The values
-   * of the instructions is not fixed.
-   *
-   * An enumerated type (enum rx_opcode) defines the set of instructions
-   * that the compiler or matcher might generate.  When filling an instruction
-   * frame, the INX field is found by indexing this instruction table
-   * with an opcode:
-   */
-  void ** instruction_table;
-
-  /* The list of all states in an NFA.
-   * During compilation, the NEXT field of NFA states links this list.
-   * After compilation, all the states are compacted into an array,
-   * ordered by state id numbers.  At that time, this points to the base 
-   * of that array.
-   */
-  struct rx_nfa_state *nfa_states;
-
-  /* Every nfa begins with one distinguished starting state:
-   */
-  struct rx_nfa_state *start;
-
-  /* This orders the search through super-nfa paths.
-   * See the comment near the typedef of rx_se_list_order.
-   */
-  rx_se_list_order se_list_cmp;
-
-  struct rx_superset * start_set;
-};
-
-
-
-
-/* SYNTAX TREES */
-
-/* Compilation is in stages.  
- *
- * In the first stage, a pattern specified by a string is 
- * translated into a syntax tree.  Later stages will convert
- * the syntax tree into an NFA optimized for conversion to a
- * superstate-NFA.
- *
- * This page is about syntax trees.
- */
-
-enum rexp_node_type
-{
-  r_cset,			/* Match from a character set. `a' or `[a-z]'*/
-  r_concat,			/* Concat two subexpressions.   `ab' */
-  r_alternate,			/* Choose one of two subexpressions. `a\|b' */
-  r_opt,			/* Optional subexpression. `a?' */
-  r_star,			/* Repeated subexpression. `a*' */
-
-
-  /* A 2phase-star is a variation on a repeated subexpression.
-   * In this case, there are two subexpressions.  The first, if matched,
-   * begins a repitition (otherwise, the whole expression is matches the
-   * empth string).  
-   * 
-   * After matching the first subexpression, a 2phase star either finishes,
-   * or matches the second subexpression.  If the second subexpression is
-   * matched, then the whole construct repeats.
-   *
-   * 2phase stars are used in two circumstances.  First, they
-   * are used as part of the implementation of POSIX intervals (counted
-   * repititions).  Second, they are used to implement proper star
-   * semantics when the repeated subexpression contains paths of
-   * only side effects.  See rx_compile for more information.
-   */
-  r_2phase_star,
-
-
-  /* c.f. "typedef void * rx_side_effect" */
-  r_side_effect,
-
-  /* This is an extension type:  It is for transient use in source->source
-   * transformations (implemented over syntax trees).
-   */
-  r_data
-};
-
-/* A side effect is a matcher-specific action associated with
- * transitions in the NFA.  The details of side effects are up
- * to the matcher.  To the compiler and superstate constructors
- * side effects are opaque:
- */
-
-typedef void * rx_side_effect;
-
-/* Nodes in a syntax tree are of this type:
- */
-struct rexp_node
-{
-  enum rexp_node_type type;
-  union
-  {
-    rx_Bitset cset;
-    rx_side_effect side_effect;
-    struct
-      {
-	struct rexp_node *left;
-	struct rexp_node *right;
-      } pair;
-    void * data;
-  } params;
-};
-
-
-
-/* NFA
- *
- * A syntax tree is compiled into an NFA.  This page defines the structure
- * of that NFA.
- */
-
-struct rx_nfa_state
-{
-  /* These are kept in a list as the NFA is being built. */
-  struct rx_nfa_state *next;
-
-  /* After the NFA is built, states are given integer id's.
-   * States whose outgoing transitions are all either epsilon or 
-   * side effect edges are given ids less than 0.  Other states
-   * are given successive non-negative ids starting from 0.
-   */
-  int id;
-
-  /* The list of NFA edges that go from this state to some other. */
-  struct rx_nfa_edge *edges;
-
-  /* If you land in this state, then you implicitly land
-   * in all other states reachable by only epsilon translations.
-   * Call the set of maximal paths to such states the epsilon closure
-   * of this state.
-   *
-   * There may be other states that are reachable by a mixture of
-   * epsilon and side effect edges.  Consider the set of maximal paths
-   * of that sort from this state.  Call it the epsilon-side-effect
-   * closure of the state.
-   * 
-   * The epsilon closure of the state is a subset of the epsilon-side-
-   * effect closure.  It consists of all the paths that contain 
-   * no side effects -- only epsilon edges.
-   * 
-   * The paths in the epsilon-side-effect closure  can be partitioned
-   * into equivalance sets. Two paths are equivalant if they have the
-   * same set of side effects, in the same order.  The epsilon-closure
-   * is one of these equivalance sets.  Let's call these equivalance
-   * sets: observably equivalant path sets.  That name is chosen
-   * because equivalance of two paths means they cause the same side
-   * effects -- so they lead to the same subsequent observations other
-   * than that they may wind up in different target states.
-   *
-   * The superstate nfa, which is derived from this nfa, is based on
-   * the observation that all of the paths in an observably equivalant
-   * path set can be explored at the same time, provided that the
-   * matcher keeps track not of a single nfa state, but of a set of
-   * states.   In particular, after following all the paths in an
-   * observably equivalant set, you wind up at a set of target states.
-   * That set of target states corresponds to one state in the
-   * superstate NFA.
-   *
-   * Staticly, before matching begins, it is convenient to analyze the
-   * nfa.  Each state is labeled with a list of the observably
-   * equivalant path sets who's union covers all the
-   * epsilon-side-effect paths beginning in this state.  This list is
-   * called the possible futures of the state.
-   *
-   * A trivial example is this NFA:
-   *             s1
-   *         A  --->  B
-   *
-   *             s2  
-   *            --->  C
-   *
-   *             epsilon           s1
-   *            --------->  D   ------> E
-   * 
-   * 
-   * In this example, A has two possible futures.
-   * One invokes the side effect `s1' and contains two paths,
-   * one ending in state B, the other in state E.
-   * The other invokes the side effect `s2' and contains only
-   * one path, landing in state C.
-   */
-  struct rx_possible_future *futures;
-
-
-  /* There are exactly two distinguished states in every NFA: */
-  unsigned int is_final:1;
-  unsigned int is_start:1;
-
-  /* These are used during NFA construction... */
-  unsigned int eclosure_needed:1;
-  unsigned int mark:1;
-};
-
-
-/* An edge in an NFA is typed: */
-enum rx_nfa_etype
-{
-  /* A cset edge is labled with a set of characters one of which
-   * must be matched for the edge to be taken.
-   */
-  ne_cset,
-
-  /* An epsilon edge is taken whenever its starting state is
-   * reached. 
-   */
-  ne_epsilon,
-
-  /* A side effect edge is taken whenever its starting state is
-   * reached.  Side effects may cause the match to fail or the
-   * position of the matcher to advance.
-   */
-  ne_side_effect		/* A special kind of epsilon. */
-};
-
-struct rx_nfa_edge
-{
-  struct rx_nfa_edge *next;
-  enum rx_nfa_etype type;
-  struct rx_nfa_state *dest;
-  union
-  {
-    rx_Bitset cset;
-    rx_side_effect side_effect;
-  } params;
-};
-
-
-
-/* A possible future consists of a list of side effects
- * and a set of destination states.  Below are their
- * representations.  These structures are hash-consed which
- * means that lists with the same elements share a representation
- * (their addresses are ==).
- */
-
-struct rx_nfa_state_set
-{
-  struct rx_nfa_state * car;
-  struct rx_nfa_state_set * cdr;
-};
-
-struct rx_se_list
-{
-  rx_side_effect car;
-  struct rx_se_list * cdr;
-};
-
-struct rx_possible_future
-{
-  struct rx_possible_future *next;
-  struct rx_se_list * effects;
-  struct rx_nfa_state_set * destset;
-};
-
-
-
-/* This begins the description of the superstate NFA.
- *
- * The superstate NFA corresponds to the NFA in these ways:
- *
- * Every superstate NFA states SUPER correspond to sets of NFA states,
- * nfa_states(SUPER).
- *
- * Superstate edges correspond to NFA paths.
- *
- * The superstate has no epsilon transitions;
- * every edge has a character label, and a (possibly empty) side
- * effect label.   The side effect label corresponds to a list of
- * side effects that occur in the NFA.  These parts are referred
- * to as:   superedge_character(EDGE) and superedge_sides(EDGE).
- *
- * For a superstate edge EDGE starting in some superstate SUPER,
- * the following is true (in pseudo-notation :-):
- *
- *       exists DEST in nfa_states s.t. 
- *         exists nfaEDGE in nfa_edges s.t.
- *                 origin (nfaEDGE) == DEST
- *              && origin (nfaEDGE) is a member of nfa_states(SUPER)
- *              && exists PF in possible_futures(dest(nfaEDGE)) s.t.
- * 	                sides_of_possible_future (PF) == superedge_sides (EDGE)
- *
- * also:
- *
- *      let SUPER2 := superedge_destination(EDGE)
- *          nfa_states(SUPER2)
- *           == union of all nfa state sets S s.t.
- *                          exists PF in possible_futures(dest(nfaEDGE)) s.t.
- * 	                       sides_of_possible_future (PF) == superedge_sides (EDGE)
- *                          && S == dests_of_possible_future (PF) }
- *
- * Or in english, every superstate is a set of nfa states.  A given
- * character and a superstate implies many transitions in the NFA --
- * those that begin with an edge labeled with that character from a
- * state in the set corresponding to the superstate.
- * 
- * The destinations of those transitions each have a set of possible
- * futures.  A possible future is a list of side effects and a set of
- * destination NFA states.  Two sets of possible futures can be
- * `merged' by combining all pairs of possible futures that have the
- * same side effects.  A pair is combined by creating a new future
- * with the same side effect but the union of the two destination sets.
- * In this way, all the possible futures suggested by a superstate
- * and a character can be merged into a set of possible futures where
- * no two elements of the set have the same set of side effects.
- *
- * The destination of a possible future, being a set of NFA states, 
- * corresponds to a supernfa state.  So, the merged set of possible
- * futures we just created can serve as a set of edges in the
- * supernfa.
- *
- * The representation of the superstate nfa and the nfa is critical.
- * The nfa has to be compact, but has to facilitate the rapid
- * computation of missing superstates.  The superstate nfa has to 
- * be fast to interpret, lazilly constructed, and bounded in space.
- *
- * To facilitate interpretation, the superstate data structures are 
- * peppered with `instruction frames'.  There is an instruction set
- * defined below which matchers using the supernfa must be able to
- * interpret.
- *
- * We'd like to make it possible but not mandatory to use code
- * addresses to represent instructions (c.f. gcc's computed goto).
- * Therefore, we define an enumerated type of opcodes, and when
- * writing one of these instructions into a data structure, use
- * the opcode as an index into a table of instruction values.
- * 
- * Here are the opcodes that occur in the superstate nfa:
- */
- 
-
-/* Every superstate contains a table of instruction frames indexed 
- * by characters.  A normal `move' in a matcher is to fetch the next
- * character and use it as an index into a superstates transition
- * table.
- *
- * In the fasted case, only one edge follows from that character.
- * In other cases there is more work to do.
- * 
- * The descriptions of the opcodes refer to data structures that are
- * described further below. 
- */
-
-enum rx_opcode
-{
-  /* 
-   * BACKTRACK_POINT is invoked when a character transition in 
-   * a superstate leads to more than one edge.  In that case,
-   * the edges have to be explored independently using a backtracking
-   * strategy.
-   *
-   * A BACKTRACK_POINT instruction is stored in a superstate's 
-   * transition table for some character when it is known that that
-   * character crosses more than one edge.  On encountering this
-   * instruction, the matcher saves enough state to backtrack to this
-   * point in the match later.
-   */
-  rx_backtrack_point = 0,	/* data is (struct transition_class *) */
-
-  /* 
-   * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
-   * There is one occurence of this instruction per rx_distinct_future.
-   * This instruction is skipped if a rx_distinct_future has no side effects.
-   */
-  rx_do_side_effects = rx_backtrack_point + 1,
-
-  /* data is (struct rx_distinct_future *) */
-
-  /* 
-   * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
-   * destination superstate has been reclaimed (or was never built).
-   * It recomputes the destination superstate.
-   * RX_CACHE_MISS is also stored in a superstate transition table before
-   * any of its edges have been built.
-   */
-  rx_cache_miss = rx_do_side_effects + 1,
-  /* data is (struct rx_distinct_future *) */
-
-  /* 
-   * RX_NEXT_CHAR is called to consume the next character and take the
-   * corresponding transition.  This is the only instruction that uses 
-   * the DATA field of the instruction frame instead of DATA_2.
-   * (see EXPLORE_FUTURE in regex.c).
-   */
-  rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
-
-  /* RX_BACKTRACK indicates that a transition fails.
-   */
-  rx_backtrack = rx_next_char + 1, /* no data */
-
-  /* 
-   * RX_ERROR_INX is stored only in places that should never be executed.
-   */
-  rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
-
-  rx_num_instructions = rx_error_inx + 1
-};
-
-/* An id_instruction_table holds the values stored in instruction
- * frames.  The table is indexed by the enums declared above.
- */
-extern void * rx_id_instruction_table[rx_num_instructions];
-
-/* The heart of the matcher is a `word-code-interpreter' 
- * (like a byte-code interpreter, except that instructions
- * are a full word wide).
- *
- * Instructions are not stored in a vector of code, instead,
- * they are scattered throughout the data structures built
- * by the regexp compiler and the matcher.  One word-code instruction,
- * together with the arguments to that instruction, constitute
- * an instruction frame (struct rx_inx).
- *
- * This structure type is padded by hand to a power of 2 because
- * in one of the dominant cases, we dispatch by indexing a table
- * of instruction frames.  If that indexing can be accomplished
- * by just a shift of the index, we're happy.
- *
- * Instructions take at most one argument, but there are two
- * slots in an instruction frame that might hold that argument.
- * These are called data and data_2.  The data slot is only
- * used for one instruction (RX_NEXT_CHAR).  For all other 
- * instructions, data should be set to 0.
- *
- * RX_NEXT_CHAR is the most important instruction by far.
- * By reserving the data field for its exclusive use, 
- * instruction dispatch is sped up in that case.  There is
- * no need to fetch both the instruction and the data,
- * only the data is needed.  In other words, a `cycle' begins
- * by fetching the field data.  If that is non-0, then it must
- * be the destination state of a next_char transition, so
- * make that value the current state, advance the match position
- * by one character, and start a new cycle.  On the other hand,
- * if data is 0, fetch the instruction and do a more complicated
- * dispatch on that.
- */
-
-struct rx_inx 
-{
-  void * data;
-  void * data_2;
-  void * inx;
-  void * fnord;
-};
-
-#ifndef RX_TAIL_ARRAY
-#define RX_TAIL_ARRAY  1
-#endif
-
-/* A superstate corresponds to a set of nfa states.  Those sets are
- * represented by STRUCT RX_SUPERSET.  The constructors
- * guarantee that only one (shared) structure is created for a given set.
- */
-struct rx_superset
-{
-  int refs;			/* This is a reference counted structure. */
-
-  /* We keep these sets in a cache because (in an unpredictable way),
-   * the same set is often created again and again.  But that is also
-   * problematic -- compatibility with POSIX and GNU regex requires
-   * that we not be able to tell when a program discards a particular
-   * NFA (thus invalidating the supersets created from it).
-   *
-   * But when a cache hit appears to occur, we will have in hand the
-   * nfa for which it may have happened.  That is why every nfa is given
-   * its own sequence number.  On a cache hit, the cache is validated
-   * by comparing the nfa sequence number to this field:
-   */
-  int id;
-
-  struct rx_nfa_state * car;	/* May or may not be a valid addr. */
-  struct rx_superset * cdr;
-
-  /* If the corresponding superstate exists: */
-  struct rx_superstate * superstate;
-
-
-  /* There is another bookkeeping problem.  It is expensive to 
-   * compute the starting nfa state set for an nfa.  So, once computed,
-   * it is cached in the `struct rx'.
-   *
-   * But, the state set can be flushed from the superstate cache.
-   * When that happens, we can't know if the corresponding `struct rx'
-   * is still alive or if it has been freed or re-used by the program.
-   * So, the cached pointer to this set in a struct rx might be invalid
-   * and we need a way to validate it.
-   *
-   * Fortunately, even if this set is flushed from the cache, it is
-   * not freed.  It just goes on the free-list of supersets.
-   * So we can still examine it.  
-   *
-   * So to validate a starting set memo, check to see if the
-   * starts_for field still points back to the struct rx in question,
-   * and if the ID matches the rx sequence number.
-   */
-  struct rx * starts_for;
-
-  /* This is used to link into a hash bucket so these objects can
-   * be `hash-consed'.
-   */
-  struct rx_hash_item hash_item;
-};
-
-#define rx_protect_superset(RX,CON) (++(CON)->refs)
-
-/* The terminology may be confusing (rename this structure?).
- * Every character occurs in at most one rx_super_edge per super-state.
- * But, that structure might have more than one option, indicating a point
- * of non-determinism. 
- *
- * In other words, this structure holds a list of superstate edges
- * sharing a common starting state and character label.  The edges
- * are in the field OPTIONS.  All superstate edges sharing the same
- * starting state and character are in this list.
- */
-struct rx_super_edge
-{
-  struct rx_super_edge *next;
-  struct rx_inx rx_backtrack_frame;
-  int cset_size;
-  rx_Bitset cset;
-  struct rx_distinct_future *options;
-};
-
-/* A superstate is a set of nfa states (RX_SUPERSET) along
- * with a transition table.  Superstates are built on demand and reclaimed
- * without warning.  To protect a superstate from this ghastly fate,
- * use LOCK_SUPERSTATE. 
- */
-struct rx_superstate
-{
-  int rx_id;			/* c.f. the id field of rx_superset */
-  int locks;			/* protection from reclamation */
-
-  /* Within a superstate cache, all the superstates are kept in a big
-   * queue.  The tail of the queue is the state most likely to be
-   * reclaimed.  The *recyclable fields hold the queue position of 
-   * this state.
-   */
-  struct rx_superstate * next_recyclable;
-  struct rx_superstate * prev_recyclable;
-
-  /* The supernfa edges that exist in the cache and that have
-   * this state as their destination are kept in this list:
-   */
-  struct rx_distinct_future * transition_refs;
-
-  /* The list of nfa states corresponding to this superstate: */
-  struct rx_superset * contents;
-
-  /* The list of edges in the cache beginning from this state. */
-  struct rx_super_edge * edges;
-
-  /* A tail of the recyclable queue is marked as semifree.  A semifree
-   * state has no incoming next_char transitions -- any transition
-   * into a semifree state causes a complex dispatch with the side
-   * effect of rescuing the state from its semifree state.
-   *
-   * An alternative to this might be to make next_char more expensive,
-   * and to move a state to the head of the recyclable queue whenever
-   * it is entered.  That way, popular states would never be recycled.
-   *
-   * But unilaterally making next_char more expensive actually loses.
-   * So, incoming transitions are only made expensive for states near
-   * the tail of the recyclable queue.  The more cache contention
-   * there is, the more frequently a state will have to prove itself
-   * and be moved back to the front of the queue.  If there is less 
-   * contention, then popular states just aggregate in the front of 
-   * the queue and stay there.
-   */
-  int is_semifree;
-
-
-  /* This keeps track of the size of the transition table for this
-   * state.  There is a half-hearted attempt to support variable sized
-   * superstates.
-   */
-  int trans_size;
-
-  /* Indexed by characters... */
-  struct rx_inx transitions[RX_TAIL_ARRAY];
-};
-
-
-/* A list of distinct futures define the edges that leave from a 
- * given superstate on a given character.  c.f. rx_super_edge.
- */
-
-struct rx_distinct_future
-{
-  struct rx_distinct_future * next_same_super_edge[2];
-  struct rx_distinct_future * next_same_dest;
-  struct rx_distinct_future * prev_same_dest;
-  struct rx_superstate * present;	/* source state */
-  struct rx_superstate * future;	/* destination state */
-  struct rx_super_edge * edge;
-
-
-  /* The future_frame holds the instruction that should be executed
-   * after all the side effects are done, when it is time to complete
-   * the transition to the next state.
-   *
-   * Normally this is a next_char instruction, but it may be a
-   * cache_miss instruction as well, depending on whether or not
-   * the superstate is in the cache and semifree.
-   * 
-   * If this is the only future for a given superstate/char, and
-   * if there are no side effects to be performed, this frame is
-   * not used (directly) at all.  Instead, its contents are copied
-   * into the transition table of the starting state of this dist. future.
-   */
-  struct rx_inx future_frame;
-
-  struct rx_inx side_effects_frame;
-  struct rx_se_list * effects;
-};
-
-#define rx_lock_superstate(R,S)  ((S)->locks++)
-#define rx_unlock_superstate(R,S) (--(S)->locks)
-
-
-/* This page destined for rx.h */
-
-struct rx_blocklist
-{
-  struct rx_blocklist * next;
-  int bytes;
-};
-
-struct rx_freelist
-{
-  struct rx_freelist * next;
-};
-
-struct rx_cache;
-
-#ifdef __STDC__
-typedef void (*rx_morecore_fn)(struct rx_cache *);
-#else
-typedef void (*rx_morecore_fn)();
-#endif
-
-/* You use this to control the allocation of superstate data 
- * during matching.  Most of it should be initialized to 0.
- *
- * A MORECORE function is necessary.  It should allocate
- * a new block of memory or return 0.
- * A default that uses malloc is called `rx_morecore'.
- *
- * The number of SUPERSTATES_ALLOWED indirectly limits how much memory
- * the system will try to allocate.  The default is 128.  Batch style
- * applications that are very regexp intensive should use as high a number
- * as possible without thrashing.
- * 
- * The LOCAL_CSET_SIZE is the number of characters in a character set.
- * It is therefore the number of entries in a superstate transition table.
- * Generally, it should be 256.  If your character set has 16 bits, 
- * it is better to translate your regexps into equivalent 8 bit patterns.
- */
-
-struct rx_cache
-{
-  struct rx_hash_rules superset_hash_rules;
-
-  /* Objects are allocated by incrementing a pointer that
author	Eric Andersen <andersen@codepoet.org>	2000-10-20 03:48:11 +0000
committer	Eric Andersen <andersen@codepoet.org>	2000-10-20 03:48:11 +0000
commit	82d766043c6a8dcf6283788419f110dd7ab52f80 (patch)
tree	09505131008d1b4d2178065878c3e8e0d54c26a2
parent	5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff)