summaryrefslogtreecommitdiff
path: root/include/regex.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/regex.h')
-rw-r--r--include/regex.h3754
1 files changed, 3754 insertions, 0 deletions
diff --git a/include/regex.h b/include/regex.h
new file mode 100644
index 000000000..813882c42
--- /dev/null
+++ b/include/regex.h
@@ -0,0 +1,3754 @@
+#if !defined(_RX_H) || defined(RX_WANT_SE_DEFS)
+#define _RX_H
+
+/* Copyright (C) 1992, 1993 Free Software Foundation, Inc.
+
+This file is part of the librx library.
+
+Librx is free software; you can redistribute it and/or modify it under
+the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+Librx is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with this software; see the file COPYING.LIB. If not,
+write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
+02139, USA. */
+/* t. lord Wed Sep 23 18:20:57 1992 */
+
+
+
+
+#include <features.h>
+
+#define __need_size_t
+#include <stddef.h>
+
+#include <string.h>
+
+#if RX_WANT_SE_DEFS != 1
+__BEGIN_DECLS
+#endif
+
+#ifndef RX_WANT_SE_DEFS
+
+/* This page: Bitsets */
+
+#ifndef RX_subset
+typedef unsigned int RX_subset;
+#define RX_subset_bits (32)
+#define RX_subset_mask (RX_subset_bits - 1)
+#endif
+
+typedef RX_subset * rx_Bitset;
+
+#ifdef __STDC__
+typedef void (*rx_bitset_iterator) (void *, int member_index);
+#else
+typedef void (*rx_bitset_iterator) ();
+#endif
+
+#define rx_bitset_subset(N) ((N) / RX_subset_bits)
+#define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)])
+#define RX_bitset_access(B,N,OP) \
+ ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
+#define RX_bitset_member(B,N) RX_bitset_access(B, N, &)
+#define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=)
+#define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~)
+#define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= )
+#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits)
+#define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset))
+
+
+
+/* This page: Splay trees. */
+
+#ifdef __STDC__
+typedef int (*rx_sp_comparer) (void * a, void * b);
+#else
+typedef int (*rx_sp_comparer) ();
+#endif
+
+struct rx_sp_node
+{
+ void * key;
+ void * data;
+ struct rx_sp_node * kids[2];
+};
+
+#ifdef __STDC__
+typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
+#else
+typedef void (*rx_sp_key_data_freer) ();
+#endif
+
+
+/* giant inflatable hash trees */
+
+struct rx_hash_item
+{
+ struct rx_hash_item * next_same_hash;
+ struct rx_hash * table;
+ unsigned long hash;
+ void * data;
+ void * binding;
+};
+
+struct rx_hash
+{
+ struct rx_hash * parent;
+ int refs;
+ struct rx_hash * children[13];
+ struct rx_hash_item * buckets [13];
+ int bucket_size [13];
+};
+
+struct rx_hash_rules;
+
+#ifdef __STDC__
+/* should return like == */
+typedef int (*rx_hash_eq)(void *, void *);
+typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
+typedef void (*rx_free_hash)(struct rx_hash *,
+ struct rx_hash_rules *);
+typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
+ void *);
+typedef void (*rx_free_hash_item)(struct rx_hash_item *,
+ struct rx_hash_rules *);
+#else
+typedef int (*rx_hash_eq)();
+typedef struct rx_hash * (*rx_alloc_hash)();
+typedef void (*rx_free_hash)();
+typedef struct rx_hash_item * (*rx_alloc_hash_item)();
+typedef void (*rx_free_hash_item)();
+#endif
+
+struct rx_hash_rules
+{
+ rx_hash_eq eq;
+ rx_alloc_hash hash_alloc;
+ rx_free_hash free_hash;
+ rx_alloc_hash_item hash_item_alloc;
+ rx_free_hash_item free_hash_item;
+};
+
+
+/* Forward declarations */
+
+struct rx_cache;
+struct rx_superset;
+struct rx;
+struct rx_se_list;
+
+
+
+/*
+ * GLOSSARY
+ *
+ * regexp
+ * regular expression
+ * expression
+ * pattern - a `regular' expression. The expression
+ * need not be formally regular -- it can contain
+ * constructs that don't correspond to purely regular
+ * expressions.
+ *
+ * buffer
+ * string - the string (or strings) being searched or matched.
+ *
+ * pattern buffer - a structure of type `struct re_pattern_buffer'
+ * This in turn contains a `struct rx', which holds the
+ * NFA compiled from a pattern, as well as some of the state
+ * of a matcher using the pattern.
+ *
+ * NFA - nondeterministic finite automata. Some people
+ * use this term to a member of the class of
+ * regular automata (those corresponding to a regular
+ * language). However, in this code, the meaning is
+ * more general. The automata used by Rx are comperable
+ * in power to what are usually called `push down automata'.
+ *
+ * Two NFA are built by rx for every pattern. One is built
+ * by the compiler. The other is built from the first, on
+ * the fly, by the matcher. The latter is called the `superstate
+ * NFA' because its states correspond to sets of states from
+ * the first NFA. (Joe Keane gets credit for the name
+ * `superstate NFA').
+ *
+ * NFA edges
+ * epsilon edges
+ * side-effect edges - The NFA compiled from a pattern can have three
+ * kinds of edges. Epsilon edges can be taken freely anytime
+ * their source state is reached. Character set edges can be
+ * taken when their source state is reached and when the next
+ * character in the buffer is a member of the set. Side effect
+ * edges imply a transition that can only be taken after the
+ * indicated side effect has been successfully accomplished.
+ * Some examples of side effects are:
+ *
+ * Storing the current match position to record the
+ * location of a parentesized subexpression.
+ *
+ * Advancing the matcher over N characters if they
+ * match the N characters previously matched by a
+ * parentesized subexpression.
+ *
+ * Both of those kinds of edges occur in the NFA generated
+ * by the pattern: \(.\)\1
+ *
+ * Epsilon and side effect edges are similar. Unfortunately,
+ * some of the code uses the name `epsilon edge' to mean
+ * both epsilon and side effect edges. For example, the
+ * function has_non_idempotent_epsilon_path computes the existance
+ * of a non-trivial path containing only a mix of epsilon and
+ * side effect edges. In that case `nonidempotent epsilon' is being
+ * used to mean `side effect'.
+ */
+
+
+
+
+
+/* LOW LEVEL PATTERN BUFFERS */
+
+/* Suppose that from some NFA state, more than one path through
+ * side-effect edges is possible. In what order should the paths
+ * be tried? A function of type rx_se_list_order answers that
+ * question. It compares two lists of side effects, and says
+ * which list comes first.
+ */
+
+#ifdef __STDC__
+typedef int (*rx_se_list_order) (struct rx *,
+ struct rx_se_list *,
+ struct rx_se_list *);
+#else
+typedef int (*rx_se_list_order) ();
+#endif
+
+
+
+/* Struct RX holds a compiled regular expression - that is, an nfa
+ * ready to be converted on demand to a more efficient superstate nfa.
+ * This is for the low level interface. The high-level interfaces enclose
+ * this in a `struct re_pattern_buffer'.
+ */
+struct rx
+{
+ /* The compiler assigns a unique id to every pattern.
+ * Like sequence numbers in X, there is a subtle bug here
+ * if you use Rx in a system that runs for a long time.
+ * But, because of the way the caches work out, it is almost
+ * impossible to trigger the Rx version of this bug.
+ *
+ * The id is used to validate superstates found in a cache
+ * of superstates. It isn't sufficient to let a superstate
+ * point back to the rx for which it was compiled -- the caller
+ * may be re-using a `struct rx' in which case the superstate
+ * is not really valid. So instead, superstates are validated
+ * by checking the sequence number of the pattern for which
+ * they were built.
+ */
+ int rx_id;
+
+ /* This is memory mgt. state for superstates. This may be
+ * shared by more than one struct rx.
+ */
+ struct rx_cache * cache;
+
+ /* Every regex defines the size of its own character set.
+ * A superstate has an array of this size, with each element
+ * a `struct rx_inx'. So, don't make this number too large.
+ * In particular, don't make it 2^16.
+ */
+ int local_cset_size;
+
+ /* After the NFA is built, it is copied into a contiguous region
+ * of memory (mostly for compatability with GNU regex).
+ * Here is that region, and it's size:
+ */
+ void * buffer;
+ unsigned long allocated;
+
+ /* Clients of RX can ask for some extra storage in the space pointed
+ * to by BUFFER. The field RESERVED is an input parameter to the
+ * compiler. After compilation, this much space will be available
+ * at (buffer + allocated - reserved)
+ */
+ unsigned long reserved;
+
+ /* --------- The remaining fields are for internal use only. --------- */
+ /* --------- But! they must be initialized to 0. --------- */
+
+ /* NODEC is the number of nodes in the NFA with non-epsilon
+ * transitions.
+ */
+ int nodec;
+
+ /* EPSNODEC is the number of nodes with only epsilon transitions. */
+ int epsnodec;
+
+ /* The sum (NODEC + EPSNODEC) is the total number of states in the
+ * compiled NFA.
+ */
+
+ /* Lists of side effects as stored in the NFA are `hash consed'..meaning
+ * that lists with the same elements are ==. During compilation,
+ * this table facilitates hash-consing.
+ */
+ struct rx_hash se_list_memo;
+
+ /* Lists of NFA states are also hashed.
+ */
+ struct rx_hash set_list_memo;
+
+
+
+
+ /* The compiler and matcher must build a number of instruction frames.
+ * The format of these frames is fixed (c.f. struct rx_inx). The values
+ * of the instructions is not fixed.
+ *
+ * An enumerated type (enum rx_opcode) defines the set of instructions
+ * that the compiler or matcher might generate. When filling an instruction
+ * frame, the INX field is found by indexing this instruction table
+ * with an opcode:
+ */
+ void ** instruction_table;
+
+ /* The list of all states in an NFA.
+ * During compilation, the NEXT field of NFA states links this list.
+ * After compilation, all the states are compacted into an array,
+ * ordered by state id numbers. At that time, this points to the base
+ * of that array.
+ */
+ struct rx_nfa_state *nfa_states;
+
+ /* Every nfa begins with one distinguished starting state:
+ */
+ struct rx_nfa_state *start;
+
+ /* This orders the search through super-nfa paths.
+ * See the comment near the typedef of rx_se_list_order.
+ */
+ rx_se_list_order se_list_cmp;
+
+ struct rx_superset * start_set;
+};
+
+
+
+
+/* SYNTAX TREES */
+
+/* Compilation is in stages.
+ *
+ * In the first stage, a pattern specified by a string is
+ * translated into a syntax tree. Later stages will convert
+ * the syntax tree into an NFA optimized for conversion to a
+ * superstate-NFA.
+ *
+ * This page is about syntax trees.
+ */
+
+enum rexp_node_type
+{
+ r_cset, /* Match from a character set. `a' or `[a-z]'*/
+ r_concat, /* Concat two subexpressions. `ab' */
+ r_alternate, /* Choose one of two subexpressions. `a\|b' */
+ r_opt, /* Optional subexpression. `a?' */
+ r_star, /* Repeated subexpression. `a*' */
+
+
+ /* A 2phase-star is a variation on a repeated subexpression.
+ * In this case, there are two subexpressions. The first, if matched,
+ * begins a repitition (otherwise, the whole expression is matches the
+ * empth string).
+ *
+ * After matching the first subexpression, a 2phase star either finishes,
+ * or matches the second subexpression. If the second subexpression is
+ * matched, then the whole construct repeats.
+ *
+ * 2phase stars are used in two circumstances. First, they
+ * are used as part of the implementation of POSIX intervals (counted
+ * repititions). Second, they are used to implement proper star
+ * semantics when the repeated subexpression contains paths of
+ * only side effects. See rx_compile for more information.
+ */
+ r_2phase_star,
+
+
+ /* c.f. "typedef void * rx_side_effect" */
+ r_side_effect,
+
+ /* This is an extension type: It is for transient use in source->source
+ * transformations (implemented over syntax trees).
+ */
+ r_data
+};
+
+/* A side effect is a matcher-specific action associated with
+ * transitions in the NFA. The details of side effects are up
+ * to the matcher. To the compiler and superstate constructors
+ * side effects are opaque:
+ */
+
+typedef void * rx_side_effect;
+
+/* Nodes in a syntax tree are of this type:
+ */
+struct rexp_node
+{
+ enum rexp_node_type type;
+ union
+ {
+ rx_Bitset cset;
+ rx_side_effect side_effect;
+ struct
+ {
+ struct rexp_node *left;
+ struct rexp_node *right;
+ } pair;
+ void * data;
+ } params;
+};
+
+
+
+/* NFA
+ *
+ * A syntax tree is compiled into an NFA. This page defines the structure
+ * of that NFA.
+ */
+
+struct rx_nfa_state
+{
+ /* These are kept in a list as the NFA is being built. */
+ struct rx_nfa_state *next;
+
+ /* After the NFA is built, states are given integer id's.
+ * States whose outgoing transitions are all either epsilon or
+ * side effect edges are given ids less than 0. Other states
+ * are given successive non-negative ids starting from 0.
+ */
+ int id;
+
+ /* The list of NFA edges that go from this state to some other. */
+ struct rx_nfa_edge *edges;
+
+ /* If you land in this state, then you implicitly land
+ * in all other states reachable by only epsilon translations.
+ * Call the set of maximal paths to such states the epsilon closure
+ * of this state.
+ *
+ * There may be other states that are reachable by a mixture of
+ * epsilon and side effect edges. Consider the set of maximal paths
+ * of that sort from this state. Call it the epsilon-side-effect
+ * closure of the state.
+ *
+ * The epsilon closure of the state is a subset of the epsilon-side-
+ * effect closure. It consists of all the paths that contain
+ * no side effects -- only epsilon edges.
+ *
+ * The paths in the epsilon-side-effect closure can be partitioned
+ * into equivalance sets. Two paths are equivalant if they have the
+ * same set of side effects, in the same order. The epsilon-closure
+ * is one of these equivalance sets. Let's call these equivalance
+ * sets: observably equivalant path sets. That name is chosen
+ * because equivalance of two paths means they cause the same side
+ * effects -- so they lead to the same subsequent observations other
+ * than that they may wind up in different target states.
+ *
+ * The superstate nfa, which is derived from this nfa, is based on
+ * the observation that all of the paths in an observably equivalant
+ * path set can be explored at the same time, provided that the
+ * matcher keeps track not of a single nfa state, but of a set of
+ * states. In particular, after following all the paths in an
+ * observably equivalant set, you wind up at a set of target states.
+ * That set of target states corresponds to one state in the
+ * superstate NFA.
+ *
+ * Staticly, before matching begins, it is convenient to analyze the
+ * nfa. Each state is labeled with a list of the observably
+ * equivalant path sets who's union covers all the
+ * epsilon-side-effect paths beginning in this state. This list is
+ * called the possible futures of the state.
+ *
+ * A trivial example is this NFA:
+ * s1
+ * A ---> B
+ *
+ * s2
+ * ---> C
+ *
+ * epsilon s1
+ * ---------> D ------> E
+ *
+ *
+ * In this example, A has two possible futures.
+ * One invokes the side effect `s1' and contains two paths,
+ * one ending in state B, the other in state E.
+ * The other invokes the side effect `s2' and contains only
+ * one path, landing in state C.
+ */
+ struct rx_possible_future *futures;
+
+
+ /* There are exactly two distinguished states in every NFA: */
+ unsigned int is_final:1;
+ unsigned int is_start:1;
+
+ /* These are used during NFA construction... */
+ unsigned int eclosure_needed:1;
+ unsigned int mark:1;
+};
+
+
+/* An edge in an NFA is typed: */
+enum rx_nfa_etype
+{
+ /* A cset edge is labled with a set of characters one of which
+ * must be matched for the edge to be taken.
+ */
+ ne_cset,
+
+ /* An epsilon edge is taken whenever its starting state is
+ * reached.
+ */
+ ne_epsilon,
+
+ /* A side effect edge is taken whenever its starting state is
+ * reached. Side effects may cause the match to fail or the
+ * position of the matcher to advance.
+ */
+ ne_side_effect /* A special kind of epsilon. */
+};
+
+struct rx_nfa_edge
+{
+ struct rx_nfa_edge *next;
+ enum rx_nfa_etype type;
+ struct rx_nfa_state *dest;
+ union
+ {
+ rx_Bitset cset;
+ rx_side_effect side_effect;
+ } params;
+};
+
+
+
+/* A possible future consists of a list of side effects
+ * and a set of destination states. Below are their
+ * representations. These structures are hash-consed which
+ * means that lists with the same elements share a representation
+ * (their addresses are ==).
+ */
+
+struct rx_nfa_state_set
+{
+ struct rx_nfa_state * car;
+ struct rx_nfa_state_set * cdr;
+};
+
+struct rx_se_list
+{
+ rx_side_effect car;
+ struct rx_se_list * cdr;
+};
+
+struct rx_possible_future
+{
+ struct rx_possible_future *next;
+ struct rx_se_list * effects;
+ struct rx_nfa_state_set * destset;
+};
+
+
+
+/* This begins the description of the superstate NFA.
+ *
+ * The superstate NFA corresponds to the NFA in these ways:
+ *
+ * Every superstate NFA states SUPER correspond to sets of NFA states,
+ * nfa_states(SUPER).
+ *
+ * Superstate edges correspond to NFA paths.
+ *
+ * The superstate has no epsilon transitions;
+ * every edge has a character label, and a (possibly empty) side
+ * effect label. The side effect label corresponds to a list of
+ * side effects that occur in the NFA. These parts are referred
+ * to as: superedge_character(EDGE) and superedge_sides(EDGE).
+ *
+ * For a superstate edge EDGE starting in some superstate SUPER,
+ * the following is true (in pseudo-notation :-):
+ *
+ * exists DEST in nfa_states s.t.
+ * exists nfaEDGE in nfa_edges s.t.
+ * origin (nfaEDGE) == DEST
+ * && origin (nfaEDGE) is a member of nfa_states(SUPER)
+ * && exists PF in possible_futures(dest(nfaEDGE)) s.t.
+ * sides_of_possible_future (PF) == superedge_sides (EDGE)
+ *
+ * also:
+ *
+ * let SUPER2 := superedge_destination(EDGE)
+ * nfa_states(SUPER2)
+ * == union of all nfa state sets S s.t.
+ * exists PF in possible_futures(dest(nfaEDGE)) s.t.
+ * sides_of_possible_future (PF) == superedge_sides (EDGE)
+ * && S == dests_of_possible_future (PF) }
+ *
+ * Or in english, every superstate is a set of nfa states. A given
+ * character and a superstate implies many transitions in the NFA --
+ * those that begin with an edge labeled with that character from a
+ * state in the set corresponding to the superstate.
+ *
+ * The destinations of those transitions each have a set of possible
+ * futures. A possible future is a list of side effects and a set of
+ * destination NFA states. Two sets of possible futures can be
+ * `merged' by combining all pairs of possible futures that have the
+ * same side effects. A pair is combined by creating a new future
+ * with the same side effect but the union of the two destination sets.
+ * In this way, all the possible futures suggested by a superstate
+ * and a character can be merged into a set of possible futures where
+ * no two elements of the set have the same set of side effects.
+ *
+ * The destination of a possible future, being a set of NFA states,
+ * corresponds to a supernfa state. So, the merged set of possible
+ * futures we just created can serve as a set of edges in the
+ * supernfa.
+ *
+ * The representation of the superstate nfa and the nfa is critical.
+ * The nfa has to be compact, but has to facilitate the rapid
+ * computation of missing superstates. The superstate nfa has to
+ * be fast to interpret, lazilly constructed, and bounded in space.
+ *
+ * To facilitate interpretation, the superstate data structures are
+ * peppered with `instruction frames'. There is an instruction set
+ * defined below which matchers using the supernfa must be able to
+ * interpret.
+ *
+ * We'd like to make it possible but not mandatory to use code
+ * addresses to represent instructions (c.f. gcc's computed goto).
+ * Therefore, we define an enumerated type of opcodes, and when
+ * writing one of these instructions into a data structure, use
+ * the opcode as an index into a table of instruction values.
+ *
+ * Here are the opcodes that occur in the superstate nfa:
+ */
+
+
+/* Every superstate contains a table of instruction frames indexed
+ * by characters. A normal `move' in a matcher is to fetch the next
+ * character and use it as an index into a superstates transition
+ * table.
+ *
+ * In the fasted case, only one edge follows from that character.
+ * In other cases there is more work to do.
+ *
+ * The descriptions of the opcodes refer to data structures that are
+ * described further below.
+ */
+
+enum rx_opcode
+{
+ /*
+ * BACKTRACK_POINT is invoked when a character transition in
+ * a superstate leads to more than one edge. In that case,
+ * the edges have to be explored independently using a backtracking
+ * strategy.
+ *
+ * A BACKTRACK_POINT instruction is stored in a superstate's
+ * transition table for some character when it is known that that
+ * character crosses more than one edge. On encountering this
+ * instruction, the matcher saves enough state to backtrack to this
+ * point in the match later.
+ */
+ rx_backtrack_point = 0, /* data is (struct transition_class *) */
+
+ /*
+ * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
+ * There is one occurence of this instruction per rx_distinct_future.
+ * This instruction is skipped if a rx_distinct_future has no side effects.
+ */
+ rx_do_side_effects = rx_backtrack_point + 1,
+
+ /* data is (struct rx_distinct_future *) */
+
+ /*
+ * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
+ * destination superstate has been reclaimed (or was never built).
+ * It recomputes the destination superstate.
+ * RX_CACHE_MISS is also stored in a superstate transition table before
+ * any of its edges have been built.
+ */
+ rx_cache_miss = rx_do_side_effects + 1,
+ /* data is (struct rx_distinct_future *) */
+
+ /*
+ * RX_NEXT_CHAR is called to consume the next character and take the
+ * corresponding transition. This is the only instruction that uses
+ * the DATA field of the instruction frame instead of DATA_2.
+ * (see EXPLORE_FUTURE in regex.c).
+ */
+ rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
+
+ /* RX_BACKTRACK indicates that a transition fails.
+ */
+ rx_backtrack = rx_next_char + 1, /* no data */
+
+ /*
+ * RX_ERROR_INX is stored only in places that should never be executed.
+ */
+ rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
+
+ rx_num_instructions = rx_error_inx + 1
+};
+
+/* An id_instruction_table holds the values stored in instruction
+ * frames. The table is indexed by the enums declared above.
+ */
+extern void * rx_id_instruction_table[rx_num_instructions];
+
+/* The heart of the matcher is a `word-code-interpreter'
+ * (like a byte-code interpreter, except that instructions
+ * are a full word wide).
+ *
+ * Instructions are not stored in a vector of code, instead,
+ * they are scattered throughout the data structures built
+ * by the regexp compiler and the matcher. One word-code instruction,
+ * together with the arguments to that instruction, constitute
+ * an instruction frame (struct rx_inx).
+ *
+ * This structure type is padded by hand to a power of 2 because
+ * in one of the dominant cases, we dispatch by indexing a table
+ * of instruction frames. If that indexing can be accomplished
+ * by just a shift of the index, we're happy.
+ *
+ * Instructions take at most one argument, but there are two
+ * slots in an instruction frame that might hold that argument.
+ * These are called data and data_2. The data slot is only
+ * used for one instruction (RX_NEXT_CHAR). For all other
+ * instructions, data should be set to 0.
+ *
+ * RX_NEXT_CHAR is the most important instruction by far.
+ * By reserving the data field for its exclusive use,
+ * instruction dispatch is sped up in that case. There is
+ * no need to fetch both the instruction and the data,
+ * only the data is needed. In other words, a `cycle' begins
+ * by fetching the field data. If that is non-0, then it must
+ * be the destination state of a next_char transition, so
+ * make that value the current state, advance the match position
+ * by one character, and start a new cycle. On the other hand,
+ * if data is 0, fetch the instruction and do a more complicated
+ * dispatch on that.
+ */
+
+struct rx_inx
+{
+ void * data;
+ void * data_2;
+ void * inx;
+ void * fnord;
+};
+
+#ifndef RX_TAIL_ARRAY
+#define RX_TAIL_ARRAY 1
+#endif
+
+/* A superstate corresponds to a set of nfa states. Those sets are
+ * represented by STRUCT RX_SUPERSET. The constructors
+ * guarantee that only one (shared) structure is created for a given set.
+ */
+struct rx_superset
+{
+ int refs; /* This is a reference counted structure. */
+
+ /* We keep these sets in a cache because (in an unpredictable way),
+ * the same set is often created again and again. But that is also
+ * problematic -- compatibility with POSIX and GNU regex requires
+ * that we not be able to tell when a program discards a particular
+ * NFA (thus invalidating the supersets created from it).
+ *
+ * But when a cache hit appears to occur, we will have in hand the
+ * nfa for which it may have happened. That is why every nfa is given
+ * its own sequence number. On a cache hit, the cache is validated
+ * by comparing the nfa sequence number to this field:
+ */
+ int id;
+
+ struct rx_nfa_state * car; /* May or may not be a valid addr. */
+ struct rx_superset * cdr;
+
+ /* If the corresponding superstate exists: */
+ struct rx_superstate * superstate;
+
+
+ /* There is another bookkeeping problem. It is expensive to
+ * compute the starting nfa state set for an nfa. So, once computed,
+ * it is cached in the `struct rx'.
+ *
+ * But, the state set can be flushed from the superstate cache.
+ * When that happens, we can't know if the corresponding `struct rx'
+ * is still alive or if it has been freed or re-used by the program.
+ * So, the cached pointer to this set in a struct rx might be invalid
+ * and we need a way to validate it.
+ *
+ * Fortunately, even if this set is flushed from the cache, it is
+ * not freed. It just goes on the free-list of supersets.
+ * So we can still examine it.
+ *
+ * So to validate a starting set memo, check to see if the
+ * starts_for field still points back to the struct rx in question,
+ * and if the ID matches the rx sequence number.
+ */
+ struct rx * starts_for;
+
+ /* This is used to link into a hash bucket so these objects can
+ * be `hash-consed'.
+ */
+ struct rx_hash_item hash_item;
+};
+
+#define rx_protect_superset(RX,CON) (++(CON)->refs)
+
+/* The terminology may be confusing (rename this structure?).
+ * Every character occurs in at most one rx_super_edge per super-state.
+ * But, that structure might have more than one option, indicating a point
+ * of non-determinism.
+ *
+ * In other words, this structure holds a list of superstate edges
+ * sharing a common starting state and character label. The edges
+ * are in the field OPTIONS. All superstate edges sharing the same
+ * starting state and character are in this list.
+ */
+struct rx_super_edge
+{
+ struct rx_super_edge *next;
+ struct rx_inx rx_backtrack_frame;
+ int cset_size;
+ rx_Bitset cset;
+ struct rx_distinct_future *options;
+};
+
+/* A superstate is a set of nfa states (RX_SUPERSET) along
+ * with a transition table. Superstates are built on demand and reclaimed
+ * without warning. To protect a superstate from this ghastly fate,
+ * use LOCK_SUPERSTATE.
+ */
+struct rx_superstate
+{
+ int rx_id; /* c.f. the id field of rx_superset */
+ int locks; /* protection from reclamation */
+
+ /* Within a superstate cache, all the superstates are kept in a big
+ * queue. The tail of the queue is the state most likely to be
+ * reclaimed. The *recyclable fields hold the queue position of
+ * this state.
+ */
+ struct rx_superstate * next_recyclable;
+ struct rx_superstate * prev_recyclable;
+
+ /* The supernfa edges that exist in the cache and that have
+ * this state as their destination are kept in this list:
+ */
+ struct rx_distinct_future * transition_refs;
+
+ /* The list of nfa states corresponding to this superstate: */
+ struct rx_superset * contents;
+
+ /* The list of edges in the cache beginning from this state. */
+ struct rx_super_edge * edges;
+
+ /* A tail of the recyclable queue is marked as semifree. A semifree
+ * state has no incoming next_char transitions -- any transition
+ * into a semifree state causes a complex dispatch with the side
+ * effect of rescuing the state from its semifree state.
+ *
+ * An alternative to this might be to make next_char more expensive,
+ * and to move a state to the head of the recyclable queue whenever
+ * it is entered. That way, popular states would never be recycled.
+ *
+ * But unilaterally making next_char more expensive actually loses.
+ * So, incoming transitions are only made expensive for states near
+ * the tail of the recyclable queue. The more cache contention
+ * there is, the more frequently a state will have to prove itself
+ * and be moved back to the front of the queue. If there is less
+ * contention, then popular states just aggregate in the front of
+ * the queue and stay there.
+ */
+ int is_semifree;
+
+
+ /* This keeps track of the size of the transition table for this
+ * state. There is a half-hearted attempt to support variable sized
+ * superstates.
+ */
+ int trans_size;
+
+ /* Indexed by characters... */
+ struct rx_inx transitions[RX_TAIL_ARRAY];
+};
+
+
+/* A list of distinct futures define the edges that leave from a
+ * given superstate on a given character. c.f. rx_super_edge.
+ */
+
+struct rx_distinct_future
+{
+ struct rx_distinct_future * next_same_super_edge[2];
+ struct rx_distinct_future * next_same_dest;
+ struct rx_distinct_future * prev_same_dest;
+ struct rx_superstate * present; /* source state */
+ struct rx_superstate * future; /* destination state */
+ struct rx_super_edge * edge;
+
+
+ /* The future_frame holds the instruction that should be executed
+ * after all the side effects are done, when it is time to complete
+ * the transition to the next state.
+ *
+ * Normally this is a next_char instruction, but it may be a
+ * cache_miss instruction as well, depending on whether or not
+ * the superstate is in the cache and semifree.
+ *
+ * If this is the only future for a given superstate/char, and
+ * if there are no side effects to be performed, this frame is
+ * not used (directly) at all. Instead, its contents are copied
+ * into the transition table of the starting state of this dist. future.
+ */
+ struct rx_inx future_frame;
+
+ struct rx_inx side_effects_frame;
+ struct rx_se_list * effects;
+};
+
+#define rx_lock_superstate(R,S) ((S)->locks++)
+#define rx_unlock_superstate(R,S) (--(S)->locks)
+
+
+/* This page destined for rx.h */
+
+struct rx_blocklist
+{
+ struct rx_blocklist * next;
+ int bytes;
+};
+
+struct rx_freelist
+{
+ struct rx_freelist * next;
+};
+
+struct rx_cache;
+
+#ifdef __STDC__
+typedef void (*rx_morecore_fn)(struct rx_cache *);
+#else
+typedef void (*rx_morecore_fn)();
+#endif
+
+/* You use this to control the allocation of superstate data
+ * during matching. Most of it should be initialized to 0.
+ *
+ * A MORECORE function is necessary. It should allocate
+ * a new block of memory or return 0.
+ * A default that uses malloc is called `rx_morecore'.
+ *
+ * The number of SUPERSTATES_ALLOWED indirectly limits how much memory
+ * the system will try to allocate. The default is 128. Batch style
+ * applications that are very regexp intensive should use as high a number
+ * as possible without thrashing.
+ *
+ * The LOCAL_CSET_SIZE is the number of characters in a character set.
+ * It is therefore the number of entries in a superstate transition table.
+ * Generally, it should be 256. If your character set has 16 bits,
+ * it is better to translate your regexps into equivalent 8 bit patterns.
+ */
+
+struct rx_cache
+{
+ struct rx_hash_rules superset_hash_rules;
+
+ /* Objects are allocated by incrementing a pointer that
+ * scans across rx_blocklists.
+ */
+ struct rx_blocklist * memory;
+ struct rx_blocklist * memory_pos;
+ int bytes_left;
+ char * memory_addr;
+ rx_morecore_fn morecore;
+
+ /* Freelists. */
+ struct rx_freelist * free_superstates;
+ struct rx_freelist * free_transition_classes;
+ struct rx_freelist * free_discernable_futures;
+ struct rx_freelist * free_supersets;
+ struct rx_freelist * free_hash;
+
+ /* Two sets of superstates -- those that are semifreed, and those
+ * that are being used.
+ */
+ struct rx_superstate * lru_superstate;
+ struct rx_superstate * semifree_superstate;
+
+ struct rx_superset * empty_superset;
+
+ int superstates;
+ int semifree_superstates;
+ int hits;
+ int misses;
+ int superstates_allowed;
+
+ int local_cset_size;
+ void ** instruction_table;
+
+ struct rx_hash superset_table;
+};
+
+
+
+/* The lowest-level search function supports arbitrarily fragmented
+ * strings and (optionally) suspendable/resumable searches.
+ *
+ * Callers have to provide a few hooks.
+ */
+
+#ifndef __GNUC__
+#ifdef __STDC__
+#define __const__ const
+#else
+#define __const__
+#endif
+#endif
+
+/* This holds a matcher position */
+struct rx_string_position
+{
+ __const__ unsigned char * pos; /* The current pos. */
+ __const__ unsigned char * string; /* The current string burst. */
+ __const__ unsigned char * end; /* First invalid position >= POS. */
+ int offset; /* Integer address of the current burst. */
+ int size; /* Current string's size. */
+ int search_direction; /* 1 or -1 */
+ int search_end; /* First position to not try. */
+};
+
+
+enum rx_get_burst_return
+{
+ rx_get_burst_continuation,
+ rx_get_burst_error,
+ rx_get_burst_ok,
+ rx_get_burst_no_more
+};
+
+
+/* A call to get burst should make POS valid. It might be invalid
+ * if the STRING field doesn't point to a burst that actually
+ * contains POS.
+ *
+ * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to
+ * whether or not to pad to the left. Padding to the right is always
+ * appropriate, but need not go past the point indicated by STOP.
+ *
+ * If a continuation is returned, then the reentering call to
+ * a search function will retry the get_burst.
+ */
+
+#ifdef __STDC__
+typedef enum rx_get_burst_return
+ (*rx_get_burst_fn) (struct rx_string_position * pos,
+ void * app_closure,
+ int stop);
+
+#else
<