summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Andersen <andersen@codepoet.org>2000-10-20 03:48:11 +0000
committerEric Andersen <andersen@codepoet.org>2000-10-20 03:48:11 +0000
commit82d766043c6a8dcf6283788419f110dd7ab52f80 (patch)
tree09505131008d1b4d2178065878c3e8e0d54c26a2
parent5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff)
A smaller, kinder, gentler regexp implementation.
-rw-r--r--include/regex.h3770
-rw-r--r--include/regexp.h423
-rw-r--r--libc/misc/regex/Makefile2
-rw-r--r--libc/misc/regex/regex.c5725
-rw-r--r--libc/misc/regex/rx.c7273
5 files changed, 6215 insertions, 10978 deletions
diff --git a/include/regex.h b/include/regex.h
index 64a8de685..113a32e72 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -1,1332 +1,62 @@
-#if !defined(_RX_H) || defined(RX_WANT_SE_DEFS)
-#define _RX_H
+/* Definitions for data structures and routines for the regular
+ expression library, version 0.12.
+ Copyright (C) 1985,1989-1993,1995-1998,2000 Free Software Foundation, Inc.
-/* Copyright (C) 1992, 1993 Free Software Foundation, Inc.
+ This file is part of the GNU C Library. Its master source is NOT part of
+ the C library, however. The master source lives in /gd/gnu/lib.
-This file is part of the librx library.
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
-Librx is free software; you can redistribute it and/or modify it under
-the terms of the GNU Library General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
-Librx is distributed in the hope that it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-for more details.
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
-You should have received a copy of the GNU Library General Public
-License along with this software; see the file COPYING.LIB. If not,
-write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
-02139, USA. */
-/* t. lord Wed Sep 23 18:20:57 1992 */
+#ifndef _REGEX_H
+#define _REGEX_H 1
-
-
-
-#include <features.h>
-
-#define __need_size_t
-#include <stddef.h>
-
-#include <string.h>
-
-#if RX_WANT_SE_DEFS != 1
-__BEGIN_DECLS
-#endif
-
-#ifndef RX_WANT_SE_DEFS
-
-/* This page: Bitsets */
-
-#ifndef RX_subset
-typedef unsigned int RX_subset;
-#define RX_subset_bits (32)
-#define RX_subset_mask (RX_subset_bits - 1)
-#endif
-
-typedef RX_subset * rx_Bitset;
-
-#ifdef __STDC__
-typedef void (*rx_bitset_iterator) (void *, int member_index);
-#else
-typedef void (*rx_bitset_iterator) ();
-#endif
-
-#define rx_bitset_subset(N) ((N) / RX_subset_bits)
-#define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)])
-#define RX_bitset_access(B,N,OP) \
- ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
-#define RX_bitset_member(B,N) RX_bitset_access(B, N, &)
-#define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=)
-#define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~)
-#define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= )
-#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits)
-#define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset))
-
-
-
-/* This page: Splay trees. */
-
-#ifdef __STDC__
-typedef int (*rx_sp_comparer) (void * a, void * b);
-#else
-typedef int (*rx_sp_comparer) ();
+/* Allow the use in C++ code. */
+#ifdef __cplusplus
+extern "C" {
#endif
-struct rx_sp_node
-{
- void * key;
- void * data;
- struct rx_sp_node * kids[2];
-};
+/* POSIX says that <sys/types.h> must be included (by the caller) before
+ <regex.h>. */
-#ifdef __STDC__
-typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
-#else
-typedef void (*rx_sp_key_data_freer) ();
+#if !defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE && defined VMS
+/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it
+ should be there. */
+# include <stddef.h>
#endif
-
-/* giant inflatable hash trees */
-
-struct rx_hash_item
-{
- struct rx_hash_item * next_same_hash;
- struct rx_hash * table;
- unsigned long hash;
- void * data;
- void * binding;
-};
-
-struct rx_hash
-{
- struct rx_hash * parent;
- int refs;
- struct rx_hash * children[13];
- struct rx_hash_item * buckets [13];
- int bucket_size [13];
-};
-
-struct rx_hash_rules;
-
-#ifdef __STDC__
-/* should return like == */
-typedef int (*rx_hash_eq)(void *, void *);
-typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
-typedef void (*rx_free_hash)(struct rx_hash *,
- struct rx_hash_rules *);
-typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
- void *);
-typedef void (*rx_free_hash_item)(struct rx_hash_item *,
- struct rx_hash_rules *);
-#else
-typedef int (*rx_hash_eq)();
-typedef struct rx_hash * (*rx_alloc_hash)();
-typedef void (*rx_free_hash)();
-typedef struct rx_hash_item * (*rx_alloc_hash_item)();
-typedef void (*rx_free_hash_item)();
-#endif
-
-struct rx_hash_rules
-{
- rx_hash_eq eq;
- rx_alloc_hash hash_alloc;
- rx_free_hash free_hash;
- rx_alloc_hash_item hash_item_alloc;
- rx_free_hash_item free_hash_item;
-};
-
-
-/* Forward declarations */
-
-struct rx_cache;
-struct rx_superset;
-struct rx;
-struct rx_se_list;
-
-
-
-/*
- * GLOSSARY
- *
- * regexp
- * regular expression
- * expression
- * pattern - a `regular' expression. The expression
- * need not be formally regular -- it can contain
- * constructs that don't correspond to purely regular
- * expressions.
- *
- * buffer
- * string - the string (or strings) being searched or matched.
- *
- * pattern buffer - a structure of type `struct re_pattern_buffer'
- * This in turn contains a `struct rx', which holds the
- * NFA compiled from a pattern, as well as some of the state
- * of a matcher using the pattern.
- *
- * NFA - nondeterministic finite automata. Some people
- * use this term to a member of the class of
- * regular automata (those corresponding to a regular
- * language). However, in this code, the meaning is
- * more general. The automata used by Rx are comperable
- * in power to what are usually called `push down automata'.
- *
- * Two NFA are built by rx for every pattern. One is built
- * by the compiler. The other is built from the first, on
- * the fly, by the matcher. The latter is called the `superstate
- * NFA' because its states correspond to sets of states from
- * the first NFA. (Joe Keane gets credit for the name
- * `superstate NFA').
- *
- * NFA edges
- * epsilon edges
- * side-effect edges - The NFA compiled from a pattern can have three
- * kinds of edges. Epsilon edges can be taken freely anytime
- * their source state is reached. Character set edges can be
- * taken when their source state is reached and when the next
- * character in the buffer is a member of the set. Side effect
- * edges imply a transition that can only be taken after the
- * indicated side effect has been successfully accomplished.
- * Some examples of side effects are:
- *
- * Storing the current match position to record the
- * location of a parentesized subexpression.
- *
- * Advancing the matcher over N characters if they
- * match the N characters previously matched by a
- * parentesized subexpression.
- *
- * Both of those kinds of edges occur in the NFA generated
- * by the pattern: \(.\)\1
- *
- * Epsilon and side effect edges are similar. Unfortunately,
- * some of the code uses the name `epsilon edge' to mean
- * both epsilon and side effect edges. For example, the
- * function has_non_idempotent_epsilon_path computes the existance
- * of a non-trivial path containing only a mix of epsilon and
- * side effect edges. In that case `nonidempotent epsilon' is being
- * used to mean `side effect'.
- */
-
-
-
-
-
-/* LOW LEVEL PATTERN BUFFERS */
-
-/* Suppose that from some NFA state, more than one path through
- * side-effect edges is possible. In what order should the paths
- * be tried? A function of type rx_se_list_order answers that
- * question. It compares two lists of side effects, and says
- * which list comes first.
- */
-
-#ifdef __STDC__
-typedef int (*rx_se_list_order) (struct rx *,
- struct rx_se_list *,
- struct rx_se_list *);
-#else
-typedef int (*rx_se_list_order) ();
-#endif
-
-
-
-/* Struct RX holds a compiled regular expression - that is, an nfa
- * ready to be converted on demand to a more efficient superstate nfa.
- * This is for the low level interface. The high-level interfaces enclose
- * this in a `struct re_pattern_buffer'.
- */
-struct rx
-{
- /* The compiler assigns a unique id to every pattern.
- * Like sequence numbers in X, there is a subtle bug here
- * if you use Rx in a system that runs for a long time.
- * But, because of the way the caches work out, it is almost
- * impossible to trigger the Rx version of this bug.
- *
- * The id is used to validate superstates found in a cache
- * of superstates. It isn't sufficient to let a superstate
- * point back to the rx for which it was compiled -- the caller
- * may be re-using a `struct rx' in which case the superstate
- * is not really valid. So instead, superstates are validated
- * by checking the sequence number of the pattern for which
- * they were built.
- */
- int rx_id;
-
- /* This is memory mgt. state for superstates. This may be
- * shared by more than one struct rx.
- */
- struct rx_cache * cache;
-
- /* Every regex defines the size of its own character set.
- * A superstate has an array of this size, with each element
- * a `struct rx_inx'. So, don't make this number too large.
- * In particular, don't make it 2^16.
- */
- int local_cset_size;
-
- /* After the NFA is built, it is copied into a contiguous region
- * of memory (mostly for compatability with GNU regex).
- * Here is that region, and it's size:
- */
- void * buffer;
- unsigned long allocated;
-
- /* Clients of RX can ask for some extra storage in the space pointed
- * to by BUFFER. The field RESERVED is an input parameter to the
- * compiler. After compilation, this much space will be available
- * at (buffer + allocated - reserved)
- */
- unsigned long reserved;
-
- /* --------- The remaining fields are for internal use only. --------- */
- /* --------- But! they must be initialized to 0. --------- */
-
- /* NODEC is the number of nodes in the NFA with non-epsilon
- * transitions.
- */
- int nodec;
-
- /* EPSNODEC is the number of nodes with only epsilon transitions. */
- int epsnodec;
-
- /* The sum (NODEC + EPSNODEC) is the total number of states in the
- * compiled NFA.
- */
-
- /* Lists of side effects as stored in the NFA are `hash consed'..meaning
- * that lists with the same elements are ==. During compilation,
- * this table facilitates hash-consing.
- */
- struct rx_hash se_list_memo;
-
- /* Lists of NFA states are also hashed.
- */
- struct rx_hash set_list_memo;
-
-
-
-
- /* The compiler and matcher must build a number of instruction frames.
- * The format of these frames is fixed (c.f. struct rx_inx). The values
- * of the instructions is not fixed.
- *
- * An enumerated type (enum rx_opcode) defines the set of instructions
- * that the compiler or matcher might generate. When filling an instruction
- * frame, the INX field is found by indexing this instruction table
- * with an opcode:
- */
- void ** instruction_table;
-
- /* The list of all states in an NFA.
- * During compilation, the NEXT field of NFA states links this list.
- * After compilation, all the states are compacted into an array,
- * ordered by state id numbers. At that time, this points to the base
- * of that array.
- */
- struct rx_nfa_state *nfa_states;
-
- /* Every nfa begins with one distinguished starting state:
- */
- struct rx_nfa_state *start;
-
- /* This orders the search through super-nfa paths.
- * See the comment near the typedef of rx_se_list_order.
- */
- rx_se_list_order se_list_cmp;
-
- struct rx_superset * start_set;
-};
-
-
-
-
-/* SYNTAX TREES */
-
-/* Compilation is in stages.
- *
- * In the first stage, a pattern specified by a string is
- * translated into a syntax tree. Later stages will convert
- * the syntax tree into an NFA optimized for conversion to a
- * superstate-NFA.
- *
- * This page is about syntax trees.
- */
-
-enum rexp_node_type
-{
- r_cset, /* Match from a character set. `a' or `[a-z]'*/
- r_concat, /* Concat two subexpressions. `ab' */
- r_alternate, /* Choose one of two subexpressions. `a\|b' */
- r_opt, /* Optional subexpression. `a?' */
- r_star, /* Repeated subexpression. `a*' */
-
-
- /* A 2phase-star is a variation on a repeated subexpression.
- * In this case, there are two subexpressions. The first, if matched,
- * begins a repitition (otherwise, the whole expression is matches the
- * empth string).
- *
- * After matching the first subexpression, a 2phase star either finishes,
- * or matches the second subexpression. If the second subexpression is
- * matched, then the whole construct repeats.
- *
- * 2phase stars are used in two circumstances. First, they
- * are used as part of the implementation of POSIX intervals (counted
- * repititions). Second, they are used to implement proper star
- * semantics when the repeated subexpression contains paths of
- * only side effects. See rx_compile for more information.
- */
- r_2phase_star,
-
-
- /* c.f. "typedef void * rx_side_effect" */
- r_side_effect,
-
- /* This is an extension type: It is for transient use in source->source
- * transformations (implemented over syntax trees).
- */
- r_data
-};
-
-/* A side effect is a matcher-specific action associated with
- * transitions in the NFA. The details of side effects are up
- * to the matcher. To the compiler and superstate constructors
- * side effects are opaque:
- */
-
-typedef void * rx_side_effect;
-
-/* Nodes in a syntax tree are of this type:
- */
-struct rexp_node
-{
- enum rexp_node_type type;
- union
- {
- rx_Bitset cset;
- rx_side_effect side_effect;
- struct
- {
- struct rexp_node *left;
- struct rexp_node *right;
- } pair;
- void * data;
- } params;
-};
-
-
-
-/* NFA
- *
- * A syntax tree is compiled into an NFA. This page defines the structure
- * of that NFA.
- */
-
-struct rx_nfa_state
-{
- /* These are kept in a list as the NFA is being built. */
- struct rx_nfa_state *next;
-
- /* After the NFA is built, states are given integer id's.
- * States whose outgoing transitions are all either epsilon or
- * side effect edges are given ids less than 0. Other states
- * are given successive non-negative ids starting from 0.
- */
- int id;
-
- /* The list of NFA edges that go from this state to some other. */
- struct rx_nfa_edge *edges;
-
- /* If you land in this state, then you implicitly land
- * in all other states reachable by only epsilon translations.
- * Call the set of maximal paths to such states the epsilon closure
- * of this state.
- *
- * There may be other states that are reachable by a mixture of
- * epsilon and side effect edges. Consider the set of maximal paths
- * of that sort from this state. Call it the epsilon-side-effect
- * closure of the state.
- *
- * The epsilon closure of the state is a subset of the epsilon-side-
- * effect closure. It consists of all the paths that contain
- * no side effects -- only epsilon edges.
- *
- * The paths in the epsilon-side-effect closure can be partitioned
- * into equivalance sets. Two paths are equivalant if they have the
- * same set of side effects, in the same order. The epsilon-closure
- * is one of these equivalance sets. Let's call these equivalance
- * sets: observably equivalant path sets. That name is chosen
- * because equivalance of two paths means they cause the same side
- * effects -- so they lead to the same subsequent observations other
- * than that they may wind up in different target states.
- *
- * The superstate nfa, which is derived from this nfa, is based on
- * the observation that all of the paths in an observably equivalant
- * path set can be explored at the same time, provided that the
- * matcher keeps track not of a single nfa state, but of a set of
- * states. In particular, after following all the paths in an
- * observably equivalant set, you wind up at a set of target states.
- * That set of target states corresponds to one state in the
- * superstate NFA.
- *
- * Staticly, before matching begins, it is convenient to analyze the
- * nfa. Each state is labeled with a list of the observably
- * equivalant path sets who's union covers all the
- * epsilon-side-effect paths beginning in this state. This list is
- * called the possible futures of the state.
- *
- * A trivial example is this NFA:
- * s1
- * A ---> B
- *
- * s2
- * ---> C
- *
- * epsilon s1
- * ---------> D ------> E
- *
- *
- * In this example, A has two possible futures.
- * One invokes the side effect `s1' and contains two paths,
- * one ending in state B, the other in state E.
- * The other invokes the side effect `s2' and contains only
- * one path, landing in state C.
- */
- struct rx_possible_future *futures;
-
-
- /* There are exactly two distinguished states in every NFA: */
- unsigned int is_final:1;
- unsigned int is_start:1;
-
- /* These are used during NFA construction... */
- unsigned int eclosure_needed:1;
- unsigned int mark:1;
-};
-
-
-/* An edge in an NFA is typed: */
-enum rx_nfa_etype
-{
- /* A cset edge is labled with a set of characters one of which
- * must be matched for the edge to be taken.
- */
- ne_cset,
-
- /* An epsilon edge is taken whenever its starting state is
- * reached.
- */
- ne_epsilon,
-
- /* A side effect edge is taken whenever its starting state is
- * reached. Side effects may cause the match to fail or the
- * position of the matcher to advance.
- */
- ne_side_effect /* A special kind of epsilon. */
-};
-
-struct rx_nfa_edge
-{
- struct rx_nfa_edge *next;
- enum rx_nfa_etype type;
- struct rx_nfa_state *dest;
- union
- {
- rx_Bitset cset;
- rx_side_effect side_effect;
- } params;
-};
-
-
-
-/* A possible future consists of a list of side effects
- * and a set of destination states. Below are their
- * representations. These structures are hash-consed which
- * means that lists with the same elements share a representation
- * (their addresses are ==).
- */
-
-struct rx_nfa_state_set
-{
- struct rx_nfa_state * car;
- struct rx_nfa_state_set * cdr;
-};
-
-struct rx_se_list
-{
- rx_side_effect car;
- struct rx_se_list * cdr;
-};
-
-struct rx_possible_future
-{
- struct rx_possible_future *next;
- struct rx_se_list * effects;
- struct rx_nfa_state_set * destset;
-};
-
-
-
-/* This begins the description of the superstate NFA.
- *
- * The superstate NFA corresponds to the NFA in these ways:
- *
- * Every superstate NFA states SUPER correspond to sets of NFA states,
- * nfa_states(SUPER).
- *
- * Superstate edges correspond to NFA paths.
- *
- * The superstate has no epsilon transitions;
- * every edge has a character label, and a (possibly empty) side
- * effect label. The side effect label corresponds to a list of
- * side effects that occur in the NFA. These parts are referred
- * to as: superedge_character(EDGE) and superedge_sides(EDGE).
- *
- * For a superstate edge EDGE starting in some superstate SUPER,
- * the following is true (in pseudo-notation :-):
- *
- * exists DEST in nfa_states s.t.
- * exists nfaEDGE in nfa_edges s.t.
- * origin (nfaEDGE) == DEST
- * && origin (nfaEDGE) is a member of nfa_states(SUPER)
- * && exists PF in possible_futures(dest(nfaEDGE)) s.t.
- * sides_of_possible_future (PF) == superedge_sides (EDGE)
- *
- * also:
- *
- * let SUPER2 := superedge_destination(EDGE)
- * nfa_states(SUPER2)
- * == union of all nfa state sets S s.t.
- * exists PF in possible_futures(dest(nfaEDGE)) s.t.
- * sides_of_possible_future (PF) == superedge_sides (EDGE)
- * && S == dests_of_possible_future (PF) }
- *
- * Or in english, every superstate is a set of nfa states. A given
- * character and a superstate implies many transitions in the NFA --
- * those that begin with an edge labeled with that character from a
- * state in the set corresponding to the superstate.
- *
- * The destinations of those transitions each have a set of possible
- * futures. A possible future is a list of side effects and a set of
- * destination NFA states. Two sets of possible futures can be
- * `merged' by combining all pairs of possible futures that have the
- * same side effects. A pair is combined by creating a new future
- * with the same side effect but the union of the two destination sets.
- * In this way, all the possible futures suggested by a superstate
- * and a character can be merged into a set of possible futures where
- * no two elements of the set have the same set of side effects.
- *
- * The destination of a possible future, being a set of NFA states,
- * corresponds to a supernfa state. So, the merged set of possible
- * futures we just created can serve as a set of edges in the
- * supernfa.
- *
- * The representation of the superstate nfa and the nfa is critical.
- * The nfa has to be compact, but has to facilitate the rapid
- * computation of missing superstates. The superstate nfa has to
- * be fast to interpret, lazilly constructed, and bounded in space.
- *
- * To facilitate interpretation, the superstate data structures are
- * peppered with `instruction frames'. There is an instruction set
- * defined below which matchers using the supernfa must be able to
- * interpret.
- *
- * We'd like to make it possible but not mandatory to use code
- * addresses to represent instructions (c.f. gcc's computed goto).
- * Therefore, we define an enumerated type of opcodes, and when
- * writing one of these instructions into a data structure, use
- * the opcode as an index into a table of instruction values.
- *
- * Here are the opcodes that occur in the superstate nfa:
- */
-
-
-/* Every superstate contains a table of instruction frames indexed
- * by characters. A normal `move' in a matcher is to fetch the next
- * character and use it as an index into a superstates transition
- * table.
- *
- * In the fasted case, only one edge follows from that character.
- * In other cases there is more work to do.
- *
- * The descriptions of the opcodes refer to data structures that are
- * described further below.
- */
-
-enum rx_opcode
-{
- /*
- * BACKTRACK_POINT is invoked when a character transition in
- * a superstate leads to more than one edge. In that case,
- * the edges have to be explored independently using a backtracking
- * strategy.
- *
- * A BACKTRACK_POINT instruction is stored in a superstate's
- * transition table for some character when it is known that that
- * character crosses more than one edge. On encountering this
- * instruction, the matcher saves enough state to backtrack to this
- * point in the match later.
- */
- rx_backtrack_point = 0, /* data is (struct transition_class *) */
-
- /*
- * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
- * There is one occurence of this instruction per rx_distinct_future.
- * This instruction is skipped if a rx_distinct_future has no side effects.
- */
- rx_do_side_effects = rx_backtrack_point + 1,
-
- /* data is (struct rx_distinct_future *) */
-
- /*
- * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
- * destination superstate has been reclaimed (or was never built).
- * It recomputes the destination superstate.
- * RX_CACHE_MISS is also stored in a superstate transition table before
- * any of its edges have been built.
- */
- rx_cache_miss = rx_do_side_effects + 1,
- /* data is (struct rx_distinct_future *) */
-
- /*
- * RX_NEXT_CHAR is called to consume the next character and take the
- * corresponding transition. This is the only instruction that uses
- * the DATA field of the instruction frame instead of DATA_2.
- * (see EXPLORE_FUTURE in regex.c).
- */
- rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
-
- /* RX_BACKTRACK indicates that a transition fails.
- */
- rx_backtrack = rx_next_char + 1, /* no data */
-
- /*
- * RX_ERROR_INX is stored only in places that should never be executed.
- */
- rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
-
- rx_num_instructions = rx_error_inx + 1
-};
-
-/* An id_instruction_table holds the values stored in instruction
- * frames. The table is indexed by the enums declared above.
- */
-extern void * rx_id_instruction_table[rx_num_instructions];
-
-/* The heart of the matcher is a `word-code-interpreter'
- * (like a byte-code interpreter, except that instructions
- * are a full word wide).
- *
- * Instructions are not stored in a vector of code, instead,
- * they are scattered throughout the data structures built
- * by the regexp compiler and the matcher. One word-code instruction,
- * together with the arguments to that instruction, constitute
- * an instruction frame (struct rx_inx).
- *
- * This structure type is padded by hand to a power of 2 because
- * in one of the dominant cases, we dispatch by indexing a table
- * of instruction frames. If that indexing can be accomplished
- * by just a shift of the index, we're happy.
- *
- * Instructions take at most one argument, but there are two
- * slots in an instruction frame that might hold that argument.
- * These are called data and data_2. The data slot is only
- * used for one instruction (RX_NEXT_CHAR). For all other
- * instructions, data should be set to 0.
- *
- * RX_NEXT_CHAR is the most important instruction by far.
- * By reserving the data field for its exclusive use,
- * instruction dispatch is sped up in that case. There is
- * no need to fetch both the instruction and the data,
- * only the data is needed. In other words, a `cycle' begins
- * by fetching the field data. If that is non-0, then it must
- * be the destination state of a next_char transition, so
- * make that value the current state, advance the match position
- * by one character, and start a new cycle. On the other hand,
- * if data is 0, fetch the instruction and do a more complicated
- * dispatch on that.
- */
-
-struct rx_inx
-{
- void * data;
- void * data_2;
- void * inx;
- void * fnord;
-};
-
-#ifndef RX_TAIL_ARRAY
-#define RX_TAIL_ARRAY 1
-#endif
-
-/* A superstate corresponds to a set of nfa states. Those sets are
- * represented by STRUCT RX_SUPERSET. The constructors
- * guarantee that only one (shared) structure is created for a given set.
- */
-struct rx_superset
-{
- int refs; /* This is a reference counted structure. */
-
- /* We keep these sets in a cache because (in an unpredictable way),
- * the same set is often created again and again. But that is also
- * problematic -- compatibility with POSIX and GNU regex requires
- * that we not be able to tell when a program discards a particular
- * NFA (thus invalidating the supersets created from it).
- *
- * But when a cache hit appears to occur, we will have in hand the
- * nfa for which it may have happened. That is why every nfa is given
- * its own sequence number. On a cache hit, the cache is validated
- * by comparing the nfa sequence number to this field:
- */
- int id;
-
- struct rx_nfa_state * car; /* May or may not be a valid addr. */
- struct rx_superset * cdr;
-
- /* If the corresponding superstate exists: */
- struct rx_superstate * superstate;
-
-
- /* There is another bookkeeping problem. It is expensive to
- * compute the starting nfa state set for an nfa. So, once computed,
- * it is cached in the `struct rx'.
- *
- * But, the state set can be flushed from the superstate cache.
- * When that happens, we can't know if the corresponding `struct rx'
- * is still alive or if it has been freed or re-used by the program.
- * So, the cached pointer to this set in a struct rx might be invalid
- * and we need a way to validate it.
- *
- * Fortunately, even if this set is flushed from the cache, it is
- * not freed. It just goes on the free-list of supersets.
- * So we can still examine it.
- *
- * So to validate a starting set memo, check to see if the
- * starts_for field still points back to the struct rx in question,
- * and if the ID matches the rx sequence number.
- */
- struct rx * starts_for;
-
- /* This is used to link into a hash bucket so these objects can
- * be `hash-consed'.
- */
- struct rx_hash_item hash_item;
-};
-
-#define rx_protect_superset(RX,CON) (++(CON)->refs)
-
-/* The terminology may be confusing (rename this structure?).
- * Every character occurs in at most one rx_super_edge per super-state.
- * But, that structure might have more than one option, indicating a point
- * of non-determinism.
- *
- * In other words, this structure holds a list of superstate edges
- * sharing a common starting state and character label. The edges
- * are in the field OPTIONS. All superstate edges sharing the same
- * starting state and character are in this list.
- */
-struct rx_super_edge
-{
- struct rx_super_edge *next;
- struct rx_inx rx_backtrack_frame;
- int cset_size;
- rx_Bitset cset;
- struct rx_distinct_future *options;
-};
-
-/* A superstate is a set of nfa states (RX_SUPERSET) along
- * with a transition table. Superstates are built on demand and reclaimed
- * without warning. To protect a superstate from this ghastly fate,
- * use LOCK_SUPERSTATE.
- */
-struct rx_superstate
-{
- int rx_id; /* c.f. the id field of rx_superset */
- int locks; /* protection from reclamation */
-
- /* Within a superstate cache, all the superstates are kept in a big
- * queue. The tail of the queue is the state most likely to be
- * reclaimed. The *recyclable fields hold the queue position of
- * this state.
- */
- struct rx_superstate * next_recyclable;
- struct rx_superstate * prev_recyclable;
-
- /* The supernfa edges that exist in the cache and that have
- * this state as their destination are kept in this list:
- */
- struct rx_distinct_future * transition_refs;
-
- /* The list of nfa states corresponding to this superstate: */
- struct rx_superset * contents;
-
- /* The list of edges in the cache beginning from this state. */
- struct rx_super_edge * edges;
-
- /* A tail of the recyclable queue is marked as semifree. A semifree
- * state has no incoming next_char transitions -- any transition
- * into a semifree state causes a complex dispatch with the side
- * effect of rescuing the state from its semifree state.
- *
- * An alternative to this might be to make next_char more expensive,
- * and to move a state to the head of the recyclable queue whenever
- * it is entered. That way, popular states would never be recycled.
- *
- * But unilaterally making next_char more expensive actually loses.
- * So, incoming transitions are only made expensive for states near
- * the tail of the recyclable queue. The more cache contention
- * there is, the more frequently a state will have to prove itself
- * and be moved back to the front of the queue. If there is less
- * contention, then popular states just aggregate in the front of
- * the queue and stay there.
- */
- int is_semifree;
-
-
- /* This keeps track of the size of the transition table for this
- * state. There is a half-hearted attempt to support variable sized
- * superstates.
- */
- int trans_size;
-
- /* Indexed by characters... */
- struct rx_inx transitions[RX_TAIL_ARRAY];
-};
-
-
-/* A list of distinct futures define the edges that leave from a
- * given superstate on a given character. c.f. rx_super_edge.
- */
-
-struct rx_distinct_future
-{
- struct rx_distinct_future * next_same_super_edge[2];
- struct rx_distinct_future * next_same_dest;
- struct rx_distinct_future * prev_same_dest;
- struct rx_superstate * present; /* source state */
- struct rx_superstate * future; /* destination state */
- struct rx_super_edge * edge;
-
-
- /* The future_frame holds the instruction that should be executed
- * after all the side effects are done, when it is time to complete
- * the transition to the next state.
- *
- * Normally this is a next_char instruction, but it may be a
- * cache_miss instruction as well, depending on whether or not
- * the superstate is in the cache and semifree.
- *
- * If this is the only future for a given superstate/char, and
- * if there are no side effects to be performed, this frame is
- * not used (directly) at all. Instead, its contents are copied
- * into the transition table of the starting state of this dist. future.
- */
- struct rx_inx future_frame;
-
- struct rx_inx side_effects_frame;
- struct rx_se_list * effects;
-};
-
-#define rx_lock_superstate(R,S) ((S)->locks++)
-#define rx_unlock_superstate(R,S) (--(S)->locks)
-
-
-/* This page destined for rx.h */
-
-struct rx_blocklist
-{
- struct rx_blocklist * next;
- int bytes;
-};
-
-struct rx_freelist
-{
- struct rx_freelist * next;
-};
-
-struct rx_cache;
-
-#ifdef __STDC__
-typedef void (*rx_morecore_fn)(struct rx_cache *);
-#else
-typedef void (*rx_morecore_fn)();
-#endif
-
-/* You use this to control the allocation of superstate data
- * during matching. Most of it should be initialized to 0.
- *
- * A MORECORE function is necessary. It should allocate
- * a new block of memory or return 0.
- * A default that uses malloc is called `rx_morecore'.
- *
- * The number of SUPERSTATES_ALLOWED indirectly limits how much memory
- * the system will try to allocate. The default is 128. Batch style
- * applications that are very regexp intensive should use as high a number
- * as possible without thrashing.
- *
- * The LOCAL_CSET_SIZE is the number of characters in a character set.
- * It is therefore the number of entries in a superstate transition table.
- * Generally, it should be 256. If your character set has 16 bits,
- * it is better to translate your regexps into equivalent 8 bit patterns.
- */
-
-struct rx_cache
-{
- struct rx_hash_rules superset_hash_rules;
-
- /* Objects are allocated by incrementing a pointer that