1 files changed, 3754 insertions, 0 deletions
diff --git a/include/regex.h b/include/regex.h
new file mode 100644
index 000000000..813882c42
--- /dev/null
+++ b/include/regex.h
@@ -0,0 +1,3754 @@
+#if !defined(_RX_H) || defined(RX_WANT_SE_DEFS)
+#define _RX_H
+
+/*	Copyright (C) 1992, 1993 Free Software Foundation, Inc.
+
+This file is part of the librx library.
+
+Librx is free software; you can redistribute it and/or modify it under
+the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+Librx is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with this software; see the file COPYING.LIB.  If not,
+write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
+02139, USA.  */
+/*  t. lord	Wed Sep 23 18:20:57 1992	*/
+
+
+
+
+#include <features.h>
+
+#define __need_size_t
+#include <stddef.h>
+
+#include <string.h>
+
+#if RX_WANT_SE_DEFS != 1
+__BEGIN_DECLS
+#endif
+
+#ifndef RX_WANT_SE_DEFS
+
+/* This page: Bitsets */
+
+#ifndef RX_subset
+typedef unsigned int RX_subset;
+#define RX_subset_bits	(32)
+#define RX_subset_mask	(RX_subset_bits - 1)
+#endif
+
+typedef RX_subset * rx_Bitset;
+
+#ifdef __STDC__
+typedef void (*rx_bitset_iterator) (void *, int member_index);
+#else
+typedef void (*rx_bitset_iterator) ();
+#endif
+
+#define rx_bitset_subset(N)  ((N) / RX_subset_bits)
+#define rx_bitset_subset_val(B,N)  ((B)[rx_bitset_subset(N)])
+#define RX_bitset_access(B,N,OP) \
+  ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
+#define RX_bitset_member(B,N)   RX_bitset_access(B, N, &)
+#define RX_bitset_enjoin(B,N)   RX_bitset_access(B, N, |=)
+#define RX_bitset_remove(B,N)   RX_bitset_access(B, N, &= ~)
+#define RX_bitset_toggle(B,N)   RX_bitset_access(B, N, ^= )
+#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits)
+#define rx_sizeof_bitset(N)	(rx_bitset_numb_subsets(N) * sizeof(RX_subset))
+
+
+
+/* This page: Splay trees. */
+
+#ifdef __STDC__
+typedef int (*rx_sp_comparer) (void * a, void * b);
+#else
+typedef int (*rx_sp_comparer) ();
+#endif
+
+struct rx_sp_node 
+{
+  void * key;
+  void * data;
+  struct rx_sp_node * kids[2];
+};
+
+#ifdef __STDC__
+typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
+#else
+typedef void (*rx_sp_key_data_freer) ();
+#endif
+
+
+/* giant inflatable hash trees */
+
+struct rx_hash_item
+{
+  struct rx_hash_item * next_same_hash;
+  struct rx_hash * table;
+  unsigned long hash;
+  void * data;
+  void * binding;
+};
+
+struct rx_hash
+{
+  struct rx_hash * parent;
+  int refs;
+  struct rx_hash * children[13];
+  struct rx_hash_item * buckets [13];
+  int bucket_size [13];
+};
+
+struct rx_hash_rules;
+
+#ifdef __STDC__
+/* should return like == */
+typedef int (*rx_hash_eq)(void *, void *);
+typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
+typedef void (*rx_free_hash)(struct rx_hash *,
+			    struct rx_hash_rules *);
+typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
+						    void *);
+typedef void (*rx_free_hash_item)(struct rx_hash_item *,
+				 struct rx_hash_rules *);
+#else
+typedef int (*rx_hash_eq)();
+typedef struct rx_hash * (*rx_alloc_hash)();
+typedef void (*rx_free_hash)();
+typedef struct rx_hash_item * (*rx_alloc_hash_item)();
+typedef void (*rx_free_hash_item)();
+#endif
+
+struct rx_hash_rules
+{
+  rx_hash_eq eq;
+  rx_alloc_hash hash_alloc;
+  rx_free_hash free_hash;
+  rx_alloc_hash_item hash_item_alloc;
+  rx_free_hash_item free_hash_item;
+};
+
+
+/* Forward declarations */
+
+struct rx_cache;
+struct rx_superset;
+struct rx;
+struct rx_se_list;
+
+
+
+/* 
+ * GLOSSARY
+ *
+ * regexp
+ * regular expression
+ * expression
+ * pattern - a `regular' expression.  The expression
+ *       need not be formally regular -- it can contain
+ *       constructs that don't correspond to purely regular
+ *       expressions.
+ *
+ * buffer
+ * string - the string (or strings) being searched or matched.
+ *
+ * pattern buffer - a structure of type `struct re_pattern_buffer'
+ *       This in turn contains a `struct rx', which holds the
+ *       NFA compiled from a pattern, as well as some of the state
+ *       of a matcher using the pattern.
+ *
+ * NFA - nondeterministic finite automata.  Some people
+ *       use this term to a member of the class of 
+ *       regular automata (those corresponding to a regular
+ *       language).  However, in this code, the meaning is
+ *       more general.  The automata used by Rx are comperable
+ *       in power to what are usually called `push down automata'.
+ *
+ *       Two NFA are built by rx for every pattern.  One is built
+ *       by the compiler.  The other is built from the first, on
+ *       the fly, by the matcher.  The latter is called the `superstate
+ *       NFA' because its states correspond to sets of states from
+ *       the first NFA.  (Joe Keane gets credit for the name
+ *       `superstate NFA').
+ *
+ * NFA edges
+ * epsilon edges
+ * side-effect edges - The NFA compiled from a pattern can have three
+ *       kinds of edges.  Epsilon edges can be taken freely anytime
+ *       their source state is reached.  Character set edges can be
+ *       taken when their source state is reached and when the next 
+ *       character in the buffer is a member of the set.  Side effect
+ *       edges imply a transition that can only be taken after the
+ *       indicated side effect has been successfully accomplished.
+ *       Some examples of side effects are:
+ *
+ *		Storing the current match position to record the
+ *              location of a parentesized subexpression.
+ *
+ *              Advancing the matcher over N characters if they
+ *              match the N characters previously matched by a 
+ *              parentesized subexpression.
+ *
+ *       Both of those kinds of edges occur in the NFA generated
+ *       by the pattern:  \(.\)\1
+ *
+ *       Epsilon and side effect edges are similar.  Unfortunately,
+ *       some of the code uses the name `epsilon edge' to mean
+ *       both epsilon and side effect edges.  For example,  the
+ *       function has_non_idempotent_epsilon_path computes the existance
+ *       of a non-trivial path containing only a mix of epsilon and
+ *       side effect edges.  In that case `nonidempotent epsilon' is being
+ *       used to mean `side effect'.
+ */
+
+
+
+
+
+/* LOW LEVEL PATTERN BUFFERS */
+
+/* Suppose that from some NFA state, more than one path through
+ * side-effect edges is possible.  In what order should the paths
+ * be tried?  A function of type rx_se_list_order answers that
+ * question.  It compares two lists of side effects, and says
+ * which list comes first.
+ */
+ 
+#ifdef __STDC__
+typedef int (*rx_se_list_order) (struct rx *,
+				 struct rx_se_list *, 
+				 struct rx_se_list *);
+#else
+typedef int (*rx_se_list_order) ();
+#endif
+
+
+
+/* Struct RX holds a compiled regular expression - that is, an nfa
+ * ready to be converted on demand to a more efficient superstate nfa.
+ * This is for the low level interface.  The high-level interfaces enclose
+ * this in a `struct re_pattern_buffer'.  
+ */
+struct rx
+{
+  /* The compiler assigns a unique id to every pattern.
+   * Like sequence numbers in X, there is a subtle bug here
+   * if you use Rx in a system that runs for a long time.
+   * But, because of the way the caches work out, it is almost
+   * impossible to trigger the Rx version of this bug.
+   *
+   * The id is used to validate superstates found in a cache
+   * of superstates.  It isn't sufficient to let a superstate
+   * point back to the rx for which it was compiled -- the caller
+   * may be re-using a `struct rx' in which case the superstate
+   * is not really valid.  So instead, superstates are validated
+   * by checking the sequence number of the pattern for which
+   * they were built.
+   */
+  int rx_id;
+
+  /* This is memory mgt. state for superstates.  This may be 
+   * shared by more than one struct rx.
+   */
+  struct rx_cache * cache;
+
+  /* Every regex defines the size of its own character set. 
+   * A superstate has an array of this size, with each element
+   * a `struct rx_inx'.  So, don't make this number too large.
+   * In particular, don't make it 2^16.
+   */
+  int local_cset_size;
+
+  /* After the NFA is built, it is copied into a contiguous region
+   * of memory (mostly for compatability with GNU regex).
+   * Here is that region, and it's size:
+   */
+  void * buffer;
+  unsigned long allocated;
+
+  /* Clients of RX can ask for some extra storage in the space pointed
+   * to by BUFFER.  The field RESERVED is an input parameter to the
+   * compiler.  After compilation, this much space will be available 
+   * at (buffer + allocated - reserved)
+   */
+  unsigned long reserved;
+
+  /* --------- The remaining fields are for internal use only. --------- */
+  /* --------- But! they must be initialized to 0.	       --------- */
+
+  /* NODEC is the number of nodes in the NFA with non-epsilon
+   * transitions. 
+   */
+  int nodec;
+
+  /* EPSNODEC is the number of nodes with only epsilon transitions. */
+  int epsnodec;
+
+  /* The sum (NODEC + EPSNODEC) is the total number of states in the
+   * compiled NFA.
+   */
+
+  /* Lists of side effects as stored in the NFA are `hash consed'..meaning
+   * that lists with the same elements are ==.  During compilation, 
+   * this table facilitates hash-consing.
+   */
+  struct rx_hash se_list_memo;
+
+  /* Lists of NFA states are also hashed. 
+   */
+  struct rx_hash set_list_memo;
+
+
+
+
+  /* The compiler and matcher must build a number of instruction frames.
+   * The format of these frames is fixed (c.f. struct rx_inx).  The values
+   * of the instructions is not fixed.
+   *
+   * An enumerated type (enum rx_opcode) defines the set of instructions
+   * that the compiler or matcher might generate.  When filling an instruction
+   * frame, the INX field is found by indexing this instruction table
+   * with an opcode:
+   */
+  void ** instruction_table;
+
+  /* The list of all states in an NFA.
+   * During compilation, the NEXT field of NFA states links this list.
+   * After compilation, all the states are compacted into an array,
+   * ordered by state id numbers.  At that time, this points to the base 
+   * of that array.
+   */
+  struct rx_nfa_state *nfa_states;
+
+  /* Every nfa begins with one distinguished starting state:
+   */
+  struct rx_nfa_state *start;
+
+  /* This orders the search through super-nfa paths.
+   * See the comment near the typedef of rx_se_list_order.
+   */
+  rx_se_list_order se_list_cmp;
+
+  struct rx_superset * start_set;
+};
+
+
+
+
+/* SYNTAX TREES */
+
+/* Compilation is in stages.  
+ *
+ * In the first stage, a pattern specified by a string is 
+ * translated into a syntax tree.  Later stages will convert
+ * the syntax tree into an NFA optimized for conversion to a
+ * superstate-NFA.
+ *
+ * This page is about syntax trees.
+ */
+
+enum rexp_node_type
+{
+  r_cset,			/* Match from a character set. `a' or `[a-z]'*/
+  r_concat,			/* Concat two subexpressions.   `ab' */
+  r_alternate,			/* Choose one of two subexpressions. `a\|b' */
+  r_opt,			/* Optional subexpression. `a?' */
+  r_star,			/* Repeated subexpression. `a*' */
+
+
+  /* A 2phase-star is a variation on a repeated subexpression.
+   * In this case, there are two subexpressions.  The first, if matched,
+   * begins a repitition (otherwise, the whole expression is matches the
+   * empth string).  
+   * 
+   * After matching the first subexpression, a 2phase star either finishes,
+   * or matches the second subexpression.  If the second subexpression is
+   * matched, then the whole construct repeats.
+   *
+   * 2phase stars are used in two circumstances.  First, they
+   * are used as part of the implementation of POSIX intervals (counted
+   * repititions).  Second, they are used to implement proper star
+   * semantics when the repeated subexpression contains paths of
+   * only side effects.  See rx_compile for more information.
+   */
+  r_2phase_star,
+
+
+  /* c.f. "typedef void * rx_side_effect" */
+  r_side_effect,
+
+  /* This is an extension type:  It is for transient use in source->source
+   * transformations (implemented over syntax trees).
+   */
+  r_data
+};
+
+/* A side effect is a matcher-specific action associated with
+ * transitions in the NFA.  The details of side effects are up
+ * to the matcher.  To the compiler and superstate constructors
+ * side effects are opaque:
+ */
+
+typedef void * rx_side_effect;
+
+/* Nodes in a syntax tree are of this type:
+ */
+struct rexp_node
+{
+  enum rexp_node_type type;
+  union
+  {
+    rx_Bitset cset;
+    rx_side_effect side_effect;
+    struct
+      {
+	struct rexp_node *left;
+	struct rexp_node *right;
+      } pair;
+    void * data;
+  } params;
+};
+
+
+
+/* NFA
+ *
+ * A syntax tree is compiled into an NFA.  This page defines the structure
+ * of that NFA.
+ */
+
+struct rx_nfa_state
+{
+  /* These are kept in a list as the NFA is being built. */
+  struct rx_nfa_state *next;
+
+  /* After the NFA is built, states are given integer id's.
+   * States whose outgoing transitions are all either epsilon or 
+   * side effect edges are given ids less than 0.  Other states
+   * are given successive non-negative ids starting from 0.
+   */
+  int id;
+
+  /* The list of NFA edges that go from this state to some other. */
+  struct rx_nfa_edge *edges;
+
+  /* If you land in this state, then you implicitly land
+   * in all other states reachable by only epsilon translations.
+   * Call the set of maximal paths to such states the epsilon closure
+   * of this state.
+   *
+   * There may be other states that are reachable by a mixture of
+   * epsilon and side effect edges.  Consider the set of maximal paths
+   * of that sort from this state.  Call it the epsilon-side-effect
+   * closure of the state.
+   * 
+   * The epsilon closure of the state is a subset of the epsilon-side-
+   * effect closure.  It consists of all the paths that contain 
+   * no side effects -- only epsilon edges.
+   * 
+   * The paths in the epsilon-side-effect closure  can be partitioned
+   * into equivalance sets. Two paths are equivalant if they have the
+   * same set of side effects, in the same order.  The epsilon-closure
+   * is one of these equivalance sets.  Let's call these equivalance
+   * sets: observably equivalant path sets.  That name is chosen
+   * because equivalance of two paths means they cause the same side
+   * effects -- so they lead to the same subsequent observations other
+   * than that they may wind up in different target states.
+   *
+   * The superstate nfa, which is derived from this nfa, is based on
+   * the observation that all of the paths in an observably equivalant
+   * path set can be explored at the same time, provided that the
+   * matcher keeps track not of a single nfa state, but of a set of
+   * states.   In particular, after following all the paths in an
+   * observably equivalant set, you wind up at a set of target states.
+   * That set of target states corresponds to one state in the
+   * superstate NFA.
+   *
+   * Staticly, before matching begins, it is convenient to analyze the
+   * nfa.  Each state is labeled with a list of the observably
+   * equivalant path sets who's union covers all the
+   * epsilon-side-effect paths beginning in this state.  This list is
+   * called the possible futures of the state.
+   *
+   * A trivial example is this NFA:
+   *             s1
+   *         A  --->  B
+   *
+   *             s2  
+   *            --->  C
+   *
+   *             epsilon           s1
+   *            --------->  D   ------> E
+   * 
+   * 
+   * In this example, A has two possible futures.
+   * One invokes the side effect `s1' and contains two paths,
+   * one ending in state B, the other in state E.
+   * The other invokes the side effect `s2' and contains only
+   * one path, landing in state C.
+   */
+  struct rx_possible_future *futures;
+
+
+  /* There are exactly two distinguished states in every NFA: */
+  unsigned int is_final:1;
+  unsigned int is_start:1;
+
+  /* These are used during NFA construction... */
+  unsigned int eclosure_needed:1;
+  unsigned int mark:1;
+};
+
+
+/* An edge in an NFA is typed: */
+enum rx_nfa_etype
+{
+  /* A cset edge is labled with a set of characters one of which
+   * must be matched for the edge to be taken.
+   */
+  ne_cset,
+
+  /* An epsilon edge is taken whenever its starting state is
+   * reached. 
+   */
+  ne_epsilon,
+
+  /* A side effect edge is taken whenever its starting state is
+   * reached.  Side effects may cause the match to fail or the
+   * position of the matcher to advance.
+   */
+  ne_side_effect		/* A special kind of epsilon. */
+};
+
+struct rx_nfa_edge
+{
+  struct rx_nfa_edge *next;
+  enum rx_nfa_etype type;
+  struct rx_nfa_state *dest;
+  union
+  {
+    rx_Bitset cset;
+    rx_side_effect side_effect;
+  } params;
+};
+
+
+
+/* A possible future consists of a list of side effects
+ * and a set of destination states.  Below are their
+ * representations.  These structures are hash-consed which
+ * means that lists with the same elements share a representation
+ * (their addresses are ==).
+ */
+
+struct rx_nfa_state_set
+{
+  struct rx_nfa_state * car;
+  struct rx_nfa_state_set * cdr;
+};
+
+struct rx_se_list
+{
+  rx_side_effect car;
+  struct rx_se_list * cdr;
+};
+
+struct rx_possible_future
+{
+  struct rx_possible_future *next;
+  struct rx_se_list * effects;
+  struct rx_nfa_state_set * destset;
+};
+
+
+
+/* This begins the description of the superstate NFA.
+ *
+ * The superstate NFA corresponds to the NFA in these ways:
+ *
+ * Every superstate NFA states SUPER correspond to sets of NFA states,
+ * nfa_states(SUPER).
+ *
+ * Superstate edges correspond to NFA paths.
+ *
+ * The superstate has no epsilon transitions;
+ * every edge has a character label, and a (possibly empty) side
+ * effect label.   The side effect label corresponds to a list of
+ * side effects that occur in the NFA.  These parts are referred
+ * to as:   superedge_character(EDGE) and superedge_sides(EDGE).
+ *
+ * For a superstate edge EDGE starting in some superstate SUPER,
+ * the following is true (in pseudo-notation :-):
+ *
+ *       exists DEST in nfa_states s.t. 
+ *         exists nfaEDGE in nfa_edges s.t.
+ *                 origin (nfaEDGE) == DEST
+ *              && origin (nfaEDGE) is a member of nfa_states(SUPER)
+ *              && exists PF in possible_futures(dest(nfaEDGE)) s.t.
+ * 	                sides_of_possible_future (PF) == superedge_sides (EDGE)
+ *
+ * also:
+ *
+ *      let SUPER2 := superedge_destination(EDGE)
+ *          nfa_states(SUPER2)
+ *           == union of all nfa state sets S s.t.
+ *                          exists PF in possible_futures(dest(nfaEDGE)) s.t.
+ * 	                       sides_of_possible_future (PF) == superedge_sides (EDGE)
+ *                          && S == dests_of_possible_future (PF) }
+ *
+ * Or in english, every superstate is a set of nfa states.  A given
+ * character and a superstate implies many transitions in the NFA --
+ * those that begin with an edge labeled with that character from a
+ * state in the set corresponding to the superstate.
+ * 
+ * The destinations of those transitions each have a set of possible
+ * futures.  A possible future is a list of side effects and a set of
+ * destination NFA states.  Two sets of possible futures can be
+ * `merged' by combining all pairs of possible futures that have the
+ * same side effects.  A pair is combined by creating a new future
+ * with the same side effect but the union of the two destination sets.
+ * In this way, all the possible futures suggested by a superstate
+ * and a character can be merged into a set of possible futures where
+ * no two elements of the set have the same set of side effects.
+ *
+ * The destination of a possible future, being a set of NFA states, 
+ * corresponds to a supernfa state.  So, the merged set of possible
+ * futures we just created can serve as a set of edges in the
+ * supernfa.
+ *
+ * The representation of the superstate nfa and the nfa is critical.
+ * The nfa has to be compact, but has to facilitate the rapid
+ * computation of missing superstates.  The superstate nfa has to 
+ * be fast to interpret, lazilly constructed, and bounded in space.
+ *
+ * To facilitate interpretation, the superstate data structures are 
+ * peppered with `instruction frames'.  There is an instruction set
+ * defined below which matchers using the supernfa must be able to
+ * interpret.
+ *
+ * We'd like to make it possible but not mandatory to use code
+ * addresses to represent instructions (c.f. gcc's computed goto).
+ * Therefore, we define an enumerated type of opcodes, and when
+ * writing one of these instructions into a data structure, use
+ * the opcode as an index into a table of instruction values.
+ * 
+ * Here are the opcodes that occur in the superstate nfa:
+ */
+ 
+
+/* Every superstate contains a table of instruction frames indexed 
+ * by characters.  A normal `move' in a matcher is to fetch the next
+ * character and use it as an index into a superstates transition
+ * table.
+ *
+ * In the fasted case, only one edge follows from that character.
+ * In other cases there is more work to do.
+ * 
+ * The descriptions of the opcodes refer to data structures that are
+ * described further below. 
+ */
+
+enum rx_opcode
+{
+  /* 
+   * BACKTRACK_POINT is invoked when a character transition in 
+   * a superstate leads to more than one edge.  In that case,
+   * the edges have to be explored independently using a backtracking
+   * strategy.
+   *
+   * A BACKTRACK_POINT instruction is stored in a superstate's 
+   * transition table for some character when it is known that that
+   * character crosses more than one edge.  On encountering this
+   * instruction, the matcher saves enough state to backtrack to this
+   * point in the match later.
+   */
+  rx_backtrack_point = 0,	/* data is (struct transition_class *) */
+
+  /* 
+   * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
+   * There is one occurence of this instruction per rx_distinct_future.
+   * This instruction is skipped if a rx_distinct_future has no side effects.
+   */
+  rx_do_side_effects = rx_backtrack_point + 1,
+
+  /* data is (struct rx_distinct_future *) */
+
+  /* 
+   * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
+   * destination superstate has been reclaimed (or was never built).
+   * It recomputes the destination superstate.
+   * RX_CACHE_MISS is also stored in a superstate transition table before
+   * any of its edges have been built.
+   */
+  rx_cache_miss = rx_do_side_effects + 1,
+  /* data is (struct rx_distinct_future *) */
+
+  /* 
+   * RX_NEXT_CHAR is called to consume the next character and take the
+   * corresponding transition.  This is the only instruction that uses 
+   * the DATA field of the instruction frame instead of DATA_2.
+   * (see EXPLORE_FUTURE in regex.c).
+   */
+  rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
+
+  /* RX_BACKTRACK indicates that a transition fails.
+   */
+  rx_backtrack = rx_next_char + 1, /* no data */
+
+  /* 
+   * RX_ERROR_INX is stored only in places that should never be executed.
+   */
+  rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
+
+  rx_num_instructions = rx_error_inx + 1
+};
+
+/* An id_instruction_table holds the values stored in instruction
+ * frames.  The table is indexed by the enums declared above.
+ */
+extern void * rx_id_instruction_table[rx_num_instructions];
+
+/* The heart of the matcher is a `word-code-interpreter' 
+ * (like a byte-code interpreter, except that instructions
+ * are a full word wide).
+ *
+ * Instructions are not stored in a vector of code, instead,
+ * they are scattered throughout the data structures built
+ * by the regexp compiler and the matcher.  One word-code instruction,
+ * together with the arguments to that instruction, constitute
+ * an instruction frame (struct rx_inx).
+ *
+ * This structure type is padded by hand to a power of 2 because
+ * in one of the dominant cases, we dispatch by indexing a table
+ * of instruction frames.  If that indexing can be accomplished
+ * by just a shift of the index, we're happy.
+ *
+ * Instructions take at most one argument, but there are two
+ * slots in an instruction frame that might hold that argument.
+ * These are called data and data_2.  The data slot is only
+ * used for one instruction (RX_NEXT_CHAR).  For all other 
+ * instructions, data should be set to 0.
+ *
+ * RX_NEXT_CHAR is the most important instruction by far.
+ * By reserving the data field for its exclusive use, 
+ * instruction dispatch is sped up in that case.  There is
+ * no need to fetch both the instruction and the data,
+ * only the data is needed.  In other words, a `cycle' begins
+ * by fetching the field data.  If that is non-0, then it must
+ * be the destination state of a next_char transition, so
+ * make that value the current state, advance the match position
+ * by one character, and start a new cycle.  On the other hand,
+ * if data is 0, fetch the instruction and do a more complicated
+ * dispatch on that.
+ */
+
+struct rx_inx 
+{
+  void * data;
+  void * data_2;
+  void * inx;
+  void * fnord;
+};
+
+#ifndef RX_TAIL_ARRAY
+#define RX_TAIL_ARRAY  1
+#endif
+
+/* A superstate corresponds to a set of nfa states.  Those sets are
+ * represented by STRUCT RX_SUPERSET.  The constructors
+ * guarantee that only one (shared) structure is created for a given set.
+ */
+struct rx_superset
+{
+  int refs;			/* This is a reference counted structure. */
+
+  /* We keep these sets in a cache because (in an unpredictable way),
+   * the same set is often created again and again.  But that is also
+   * problematic -- compatibility with POSIX and GNU regex requires
+   * that we not be able to tell when a program discards a particular
+   * NFA (thus invalidating the supersets created from it).
+   *
+   * But when a cache hit appears to occur, we will have in hand the
+   * nfa for which it may have happened.  That is why every nfa is given
+   * its own sequence number.  On a cache hit, the cache is validated
+   * by comparing the nfa sequence number to this field:
+   */
+  int id;
+
+  struct rx_nfa_state * car;	/* May or may not be a valid addr. */
+  struct rx_superset * cdr;
+
+  /* If the corresponding superstate exists: */
+  struct rx_superstate * superstate;
+
+
+  /* There is another bookkeeping problem.  It is expensive to 
+   * compute the starting nfa state set for an nfa.  So, once computed,
+   * it is cached in the `struct rx'.
+   *
+   * But, the state set can be flushed from the superstate cache.
+   * When that happens, we can't know if the corresponding `struct rx'
+   * is still alive or if it has been freed or re-used by the program.
+   * So, the cached pointer to this set in a struct rx might be invalid
+   * and we need a way to validate it.
+   *
+   * Fortunately, even if this set is flushed from the cache, it is
+   * not freed.  It just goes on the free-list of supersets.
+   * So we can still examine it.  
+   *
+   * So to validate a starting set memo, check to see if the
+   * starts_for field still points back to the struct rx in question,
+   * and if the ID matches the rx sequence number.
+   */
+  struct rx * starts_for;
+
+  /* This is used to link into a hash bucket so these objects can
+   * be `hash-consed'.
+   */
+  struct rx_hash_item hash_item;
+};
+
+#define rx_protect_superset(RX,CON) (++(CON)->refs)
+
+/* The terminology may be confusing (rename this structure?).
+ * Every character occurs in at most one rx_super_edge per super-state.
+ * But, that structure might have more than one option, indicating a point
+ * of non-determinism. 
+ *
+ * In other words, this structure holds a list of superstate edges
+ * sharing a common starting state and character label.  The edges
+ * are in the field OPTIONS.  All superstate edges sharing the same
+ * starting state and character are in this list.
+ */
+struct rx_super_edge
+{
+  struct rx_super_edge *next;
+  struct rx_inx rx_backtrack_frame;
+  int cset_size;
+  rx_Bitset cset;
+  struct rx_distinct_future *options;
+};
+
+/* A superstate is a set of nfa states (RX_SUPERSET) along
+ * with a transition table.  Superstates are built on demand and reclaimed
+ * without warning.  To protect a superstate from this ghastly fate,
+ * use LOCK_SUPERSTATE. 
+ */
+struct rx_superstate
+{
+  int rx_id;			/* c.f. the id field of rx_superset */
+  int locks;			/* protection from reclamation */
+
+  /* Within a superstate cache, all the superstates are kept in a big
+   * queue.  The tail of the queue is the state most likely to be
+   * reclaimed.  The *recyclable fields hold the queue position of 
+   * this state.
+   */
+  struct rx_superstate * next_recyclable;
+  struct rx_superstate * prev_recyclable;
+
+  /* The supernfa edges that exist in the cache and that have
+   * this state as their destination are kept in this list:
+   */
+  struct rx_distinct_future * transition_refs;
+
+  /* The list of nfa states corresponding to this superstate: */
+  struct rx_superset * contents;
+
+  /* The list of edges in the cache beginning from this state. */
+  struct rx_super_edge * edges;
+
+  /* A tail of the recyclable queue is marked as semifree.  A semifree
+   * state has no incoming next_char transitions -- any transition
+   * into a semifree state causes a complex dispatch with the side
+   * effect of rescuing the state from its semifree state.
+   *
+   * An alternative to this might be to make next_char more expensive,
+   * and to move a state to the head of the recyclable queue whenever
+   * it is entered.  That way, popular states would never be recycled.
+   *
+   * But unilaterally making next_char more expensive actually loses.
+   * So, incoming transitions are only made expensive for states near
+   * the tail of the recyclable queue.  The more cache contention
+   * there is, the more frequently a state will have to prove itself
+   * and be moved back to the front of the queue.  If there is less 
+   * contention, then popular states just aggregate in the front of 
+   * the queue and stay there.
+   */
+  int is_semifree;
+
+
+  /* This keeps track of the size of the transition table for this
+   * state.  There is a half-hearted attempt to support variable sized
+   * superstates.
+   */
+  int trans_size;
+
+  /* Indexed by characters... */
+  struct rx_inx transitions[RX_TAIL_ARRAY];
+};
+
+
+/* A list of distinct futures define the edges that leave from a 
+ * given superstate on a given character.  c.f. rx_super_edge.
+ */
+
+struct rx_distinct_future
+{
+  struct rx_distinct_future * next_same_super_edge[2];
+  struct rx_distinct_future * next_same_dest;
+  struct rx_distinct_future * prev_same_dest;
+  struct rx_superstate * present;	/* source state */
+  struct rx_superstate * future;	/* destination state */
+  struct rx_super_edge * edge;
+
+
+  /* The future_frame holds the instruction that should be executed
+   * after all the side effects are done, when it is time to complete
+   * the transition to the next state.
+   *
+   * Normally this is a next_char instruction, but it may be a
+   * cache_miss instruction as well, depending on whether or not
+   * the superstate is in the cache and semifree.
+   * 
+   * If this is the only future for a given superstate/char, and
+   * if there are no side effects to be performed, this frame is
+   * not used (directly) at all.  Instead, its contents are copied
+   * into the transition table of the starting state of this dist. future.
+   */
+  struct rx_inx future_frame;
+
+  struct rx_inx side_effects_frame;
+  struct rx_se_list * effects;
+};
+
+#define rx_lock_superstate(R,S)  ((S)->locks++)
+#define rx_unlock_superstate(R,S) (--(S)->locks)
+
+
+/* This page destined for rx.h */
+
+struct rx_blocklist
+{
+  struct rx_blocklist * next;
+  int bytes;
+};
+
+struct rx_freelist
+{
+  struct rx_freelist * next;
+};
+
+struct rx_cache;
+
+#ifdef __STDC__
+typedef void (*rx_morecore_fn)(struct rx_cache *);
+#else
+typedef void (*rx_morecore_fn)();
+#endif
+
+/* You use this to control the allocation of superstate data 
+ * during matching.  Most of it should be initialized to 0.
+ *
+ * A MORECORE function is necessary.  It should allocate
+ * a new block of memory or return 0.
+ * A default that uses malloc is called `rx_morecore'.
+ *
+ * The number of SUPERSTATES_ALLOWED indirectly limits how much memory
+ * the system will try to allocate.  The default is 128.  Batch style
+ * applications that are very regexp intensive should use as high a number
+ * as possible without thrashing.
+ * 
+ * The LOCAL_CSET_SIZE is the number of characters in a character set.
+ * It is therefore the number of entries in a superstate transition table.
+ * Generally, it should be 256.  If your character set has 16 bits, 
+ * it is better to translate your regexps into equivalent 8 bit patterns.
+ */
+
+struct rx_cache
+{
+  struct rx_hash_rules superset_hash_rules;
+
+  /* Objects are allocated by incrementing a pointer that 
+   * scans across rx_blocklists.
+   */
+  struct rx_blocklist * memory;
+  struct rx_blocklist * memory_pos;
+  int bytes_left;
+  char * memory_addr;
+  rx_morecore_fn morecore;
+
+  /* Freelists. */
+  struct rx_freelist * free_superstates;
+  struct rx_freelist * free_transition_classes;
+  struct rx_freelist * free_discernable_futures;
+  struct rx_freelist * free_supersets;
+  struct rx_freelist * free_hash;
+
+  /* Two sets of superstates -- those that are semifreed, and those
+   * that are being used.
+   */
+  struct rx_superstate * lru_superstate;
+  struct rx_superstate * semifree_superstate;
+
+  struct rx_superset * empty_superset;
+
+  int superstates;
+  int semifree_superstates;
+  int hits;
+  int misses;
+  int superstates_allowed;
+
+  int local_cset_size;
+  void ** instruction_table;
+
+  struct rx_hash superset_table;
+};
+
+
+
+/* The lowest-level search function supports arbitrarily fragmented
+ * strings and (optionally) suspendable/resumable searches.
+ *
+ * Callers have to provide a few hooks.
+ */
+
+#ifndef __GNUC__
+#ifdef __STDC__
+#define __const__ const
+#else
+#define __const__
+#endif
+#endif
+
+/* This holds a matcher position */
+struct rx_string_position
+{
+  __const__ unsigned char * pos;	/* The current pos. */
+  __const__ unsigned char * string; /* The current string burst. */
+  __const__ unsigned char * end;	/* First invalid position >= POS. */
+  int offset;			/* Integer address of the current burst. */
+  int size;			/* Current string's size. */
+  int search_direction;		/* 1 or -1 */
+  int search_end;		/* First position to not try. */
+};
+
+
+enum rx_get_burst_return
+{
+  rx_get_burst_continuation,
+  rx_get_burst_error,
+  rx_get_burst_ok,
+  rx_get_burst_no_more
+};
+
+
+/* A call to get burst should make POS valid.  It might be invalid
+ * if the STRING field doesn't point to a burst that actually
+ * contains POS.
+ *
+ * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to
+ * whether or not to pad to the left.  Padding to the right is always
+ * appropriate, but need not go past the point indicated by STOP.
+ *
+ * If a continuation is returned, then the reentering call to
+ * a search function will retry the get_burst.
+ */
+
+#ifdef __STDC__
+typedef enum rx_get_burst_return
+  (*rx_get_burst_fn) (struct rx_string_position * pos,
+		      void * app_closure,
+		      int stop);
+					       
+#else