diff options
author | Eric Andersen <andersen@codepoet.org> | 2000-10-20 03:48:11 +0000 |
---|---|---|
committer | Eric Andersen <andersen@codepoet.org> | 2000-10-20 03:48:11 +0000 |
commit | 82d766043c6a8dcf6283788419f110dd7ab52f80 (patch) | |
tree | 09505131008d1b4d2178065878c3e8e0d54c26a2 | |
parent | 5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff) |
A smaller, kinder, gentler regexp implementation.
-rw-r--r-- | include/regex.h | 3770 | ||||
-rw-r--r-- | include/regexp.h | 423 | ||||
-rw-r--r-- | libc/misc/regex/Makefile | 2 | ||||
-rw-r--r-- | libc/misc/regex/regex.c | 5725 | ||||
-rw-r--r-- | libc/misc/regex/rx.c | 7273 |
5 files changed, 6215 insertions, 10978 deletions
diff --git a/include/regex.h b/include/regex.h index 64a8de685..113a32e72 100644 --- a/include/regex.h +++ b/include/regex.h @@ -1,1332 +1,62 @@ -#if !defined(_RX_H) || defined(RX_WANT_SE_DEFS) -#define _RX_H +/* Definitions for data structures and routines for the regular + expression library, version 0.12. + Copyright (C) 1985,1989-1993,1995-1998,2000 Free Software Foundation, Inc. -/* Copyright (C) 1992, 1993 Free Software Foundation, Inc. + This file is part of the GNU C Library. Its master source is NOT part of + the C library, however. The master source lives in /gd/gnu/lib. -This file is part of the librx library. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. -Librx is free software; you can redistribute it and/or modify it under -the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. -Librx is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ -You should have received a copy of the GNU Library General Public -License along with this software; see the file COPYING.LIB. If not, -write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA -02139, USA. */ -/* t. lord Wed Sep 23 18:20:57 1992 */ +#ifndef _REGEX_H +#define _REGEX_H 1 - - - -#include <features.h> - -#define __need_size_t -#include <stddef.h> - -#include <string.h> - -#if RX_WANT_SE_DEFS != 1 -__BEGIN_DECLS -#endif - -#ifndef RX_WANT_SE_DEFS - -/* This page: Bitsets */ - -#ifndef RX_subset -typedef unsigned int RX_subset; -#define RX_subset_bits (32) -#define RX_subset_mask (RX_subset_bits - 1) -#endif - -typedef RX_subset * rx_Bitset; - -#ifdef __STDC__ -typedef void (*rx_bitset_iterator) (void *, int member_index); -#else -typedef void (*rx_bitset_iterator) (); -#endif - -#define rx_bitset_subset(N) ((N) / RX_subset_bits) -#define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)]) -#define RX_bitset_access(B,N,OP) \ - ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask]) -#define RX_bitset_member(B,N) RX_bitset_access(B, N, &) -#define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=) -#define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~) -#define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= ) -#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits) -#define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset)) - - - -/* This page: Splay trees. */ - -#ifdef __STDC__ -typedef int (*rx_sp_comparer) (void * a, void * b); -#else -typedef int (*rx_sp_comparer) (); +/* Allow the use in C++ code. */ +#ifdef __cplusplus +extern "C" { #endif -struct rx_sp_node -{ - void * key; - void * data; - struct rx_sp_node * kids[2]; -}; +/* POSIX says that <sys/types.h> must be included (by the caller) before + <regex.h>. */ -#ifdef __STDC__ -typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *); -#else -typedef void (*rx_sp_key_data_freer) (); +#if !defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE && defined VMS +/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it + should be there. */ +# include <stddef.h> #endif - -/* giant inflatable hash trees */ - -struct rx_hash_item -{ - struct rx_hash_item * next_same_hash; - struct rx_hash * table; - unsigned long hash; - void * data; - void * binding; -}; - -struct rx_hash -{ - struct rx_hash * parent; - int refs; - struct rx_hash * children[13]; - struct rx_hash_item * buckets [13]; - int bucket_size [13]; -}; - -struct rx_hash_rules; - -#ifdef __STDC__ -/* should return like == */ -typedef int (*rx_hash_eq)(void *, void *); -typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *); -typedef void (*rx_free_hash)(struct rx_hash *, - struct rx_hash_rules *); -typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *, - void *); -typedef void (*rx_free_hash_item)(struct rx_hash_item *, - struct rx_hash_rules *); -#else -typedef int (*rx_hash_eq)(); -typedef struct rx_hash * (*rx_alloc_hash)(); -typedef void (*rx_free_hash)(); -typedef struct rx_hash_item * (*rx_alloc_hash_item)(); -typedef void (*rx_free_hash_item)(); -#endif - -struct rx_hash_rules -{ - rx_hash_eq eq; - rx_alloc_hash hash_alloc; - rx_free_hash free_hash; - rx_alloc_hash_item hash_item_alloc; - rx_free_hash_item free_hash_item; -}; - - -/* Forward declarations */ - -struct rx_cache; -struct rx_superset; -struct rx; -struct rx_se_list; - - - -/* - * GLOSSARY - * - * regexp - * regular expression - * expression - * pattern - a `regular' expression. The expression - * need not be formally regular -- it can contain - * constructs that don't correspond to purely regular - * expressions. - * - * buffer - * string - the string (or strings) being searched or matched. - * - * pattern buffer - a structure of type `struct re_pattern_buffer' - * This in turn contains a `struct rx', which holds the - * NFA compiled from a pattern, as well as some of the state - * of a matcher using the pattern. - * - * NFA - nondeterministic finite automata. Some people - * use this term to a member of the class of - * regular automata (those corresponding to a regular - * language). However, in this code, the meaning is - * more general. The automata used by Rx are comperable - * in power to what are usually called `push down automata'. - * - * Two NFA are built by rx for every pattern. One is built - * by the compiler. The other is built from the first, on - * the fly, by the matcher. The latter is called the `superstate - * NFA' because its states correspond to sets of states from - * the first NFA. (Joe Keane gets credit for the name - * `superstate NFA'). - * - * NFA edges - * epsilon edges - * side-effect edges - The NFA compiled from a pattern can have three - * kinds of edges. Epsilon edges can be taken freely anytime - * their source state is reached. Character set edges can be - * taken when their source state is reached and when the next - * character in the buffer is a member of the set. Side effect - * edges imply a transition that can only be taken after the - * indicated side effect has been successfully accomplished. - * Some examples of side effects are: - * - * Storing the current match position to record the - * location of a parentesized subexpression. - * - * Advancing the matcher over N characters if they - * match the N characters previously matched by a - * parentesized subexpression. - * - * Both of those kinds of edges occur in the NFA generated - * by the pattern: \(.\)\1 - * - * Epsilon and side effect edges are similar. Unfortunately, - * some of the code uses the name `epsilon edge' to mean - * both epsilon and side effect edges. For example, the - * function has_non_idempotent_epsilon_path computes the existance - * of a non-trivial path containing only a mix of epsilon and - * side effect edges. In that case `nonidempotent epsilon' is being - * used to mean `side effect'. - */ - - - - - -/* LOW LEVEL PATTERN BUFFERS */ - -/* Suppose that from some NFA state, more than one path through - * side-effect edges is possible. In what order should the paths - * be tried? A function of type rx_se_list_order answers that - * question. It compares two lists of side effects, and says - * which list comes first. - */ - -#ifdef __STDC__ -typedef int (*rx_se_list_order) (struct rx *, - struct rx_se_list *, - struct rx_se_list *); -#else -typedef int (*rx_se_list_order) (); -#endif - - - -/* Struct RX holds a compiled regular expression - that is, an nfa - * ready to be converted on demand to a more efficient superstate nfa. - * This is for the low level interface. The high-level interfaces enclose - * this in a `struct re_pattern_buffer'. - */ -struct rx -{ - /* The compiler assigns a unique id to every pattern. - * Like sequence numbers in X, there is a subtle bug here - * if you use Rx in a system that runs for a long time. - * But, because of the way the caches work out, it is almost - * impossible to trigger the Rx version of this bug. - * - * The id is used to validate superstates found in a cache - * of superstates. It isn't sufficient to let a superstate - * point back to the rx for which it was compiled -- the caller - * may be re-using a `struct rx' in which case the superstate - * is not really valid. So instead, superstates are validated - * by checking the sequence number of the pattern for which - * they were built. - */ - int rx_id; - - /* This is memory mgt. state for superstates. This may be - * shared by more than one struct rx. - */ - struct rx_cache * cache; - - /* Every regex defines the size of its own character set. - * A superstate has an array of this size, with each element - * a `struct rx_inx'. So, don't make this number too large. - * In particular, don't make it 2^16. - */ - int local_cset_size; - - /* After the NFA is built, it is copied into a contiguous region - * of memory (mostly for compatability with GNU regex). - * Here is that region, and it's size: - */ - void * buffer; - unsigned long allocated; - - /* Clients of RX can ask for some extra storage in the space pointed - * to by BUFFER. The field RESERVED is an input parameter to the - * compiler. After compilation, this much space will be available - * at (buffer + allocated - reserved) - */ - unsigned long reserved; - - /* --------- The remaining fields are for internal use only. --------- */ - /* --------- But! they must be initialized to 0. --------- */ - - /* NODEC is the number of nodes in the NFA with non-epsilon - * transitions. - */ - int nodec; - - /* EPSNODEC is the number of nodes with only epsilon transitions. */ - int epsnodec; - - /* The sum (NODEC + EPSNODEC) is the total number of states in the - * compiled NFA. - */ - - /* Lists of side effects as stored in the NFA are `hash consed'..meaning - * that lists with the same elements are ==. During compilation, - * this table facilitates hash-consing. - */ - struct rx_hash se_list_memo; - - /* Lists of NFA states are also hashed. - */ - struct rx_hash set_list_memo; - - - - - /* The compiler and matcher must build a number of instruction frames. - * The format of these frames is fixed (c.f. struct rx_inx). The values - * of the instructions is not fixed. - * - * An enumerated type (enum rx_opcode) defines the set of instructions - * that the compiler or matcher might generate. When filling an instruction - * frame, the INX field is found by indexing this instruction table - * with an opcode: - */ - void ** instruction_table; - - /* The list of all states in an NFA. - * During compilation, the NEXT field of NFA states links this list. - * After compilation, all the states are compacted into an array, - * ordered by state id numbers. At that time, this points to the base - * of that array. - */ - struct rx_nfa_state *nfa_states; - - /* Every nfa begins with one distinguished starting state: - */ - struct rx_nfa_state *start; - - /* This orders the search through super-nfa paths. - * See the comment near the typedef of rx_se_list_order. - */ - rx_se_list_order se_list_cmp; - - struct rx_superset * start_set; -}; - - - - -/* SYNTAX TREES */ - -/* Compilation is in stages. - * - * In the first stage, a pattern specified by a string is - * translated into a syntax tree. Later stages will convert - * the syntax tree into an NFA optimized for conversion to a - * superstate-NFA. - * - * This page is about syntax trees. - */ - -enum rexp_node_type -{ - r_cset, /* Match from a character set. `a' or `[a-z]'*/ - r_concat, /* Concat two subexpressions. `ab' */ - r_alternate, /* Choose one of two subexpressions. `a\|b' */ - r_opt, /* Optional subexpression. `a?' */ - r_star, /* Repeated subexpression. `a*' */ - - - /* A 2phase-star is a variation on a repeated subexpression. - * In this case, there are two subexpressions. The first, if matched, - * begins a repitition (otherwise, the whole expression is matches the - * empth string). - * - * After matching the first subexpression, a 2phase star either finishes, - * or matches the second subexpression. If the second subexpression is - * matched, then the whole construct repeats. - * - * 2phase stars are used in two circumstances. First, they - * are used as part of the implementation of POSIX intervals (counted - * repititions). Second, they are used to implement proper star - * semantics when the repeated subexpression contains paths of - * only side effects. See rx_compile for more information. - */ - r_2phase_star, - - - /* c.f. "typedef void * rx_side_effect" */ - r_side_effect, - - /* This is an extension type: It is for transient use in source->source - * transformations (implemented over syntax trees). - */ - r_data -}; - -/* A side effect is a matcher-specific action associated with - * transitions in the NFA. The details of side effects are up - * to the matcher. To the compiler and superstate constructors - * side effects are opaque: - */ - -typedef void * rx_side_effect; - -/* Nodes in a syntax tree are of this type: - */ -struct rexp_node -{ - enum rexp_node_type type; - union - { - rx_Bitset cset; - rx_side_effect side_effect; - struct - { - struct rexp_node *left; - struct rexp_node *right; - } pair; - void * data; - } params; -}; - - - -/* NFA - * - * A syntax tree is compiled into an NFA. This page defines the structure - * of that NFA. - */ - -struct rx_nfa_state -{ - /* These are kept in a list as the NFA is being built. */ - struct rx_nfa_state *next; - - /* After the NFA is built, states are given integer id's. - * States whose outgoing transitions are all either epsilon or - * side effect edges are given ids less than 0. Other states - * are given successive non-negative ids starting from 0. - */ - int id; - - /* The list of NFA edges that go from this state to some other. */ - struct rx_nfa_edge *edges; - - /* If you land in this state, then you implicitly land - * in all other states reachable by only epsilon translations. - * Call the set of maximal paths to such states the epsilon closure - * of this state. - * - * There may be other states that are reachable by a mixture of - * epsilon and side effect edges. Consider the set of maximal paths - * of that sort from this state. Call it the epsilon-side-effect - * closure of the state. - * - * The epsilon closure of the state is a subset of the epsilon-side- - * effect closure. It consists of all the paths that contain - * no side effects -- only epsilon edges. - * - * The paths in the epsilon-side-effect closure can be partitioned - * into equivalance sets. Two paths are equivalant if they have the - * same set of side effects, in the same order. The epsilon-closure - * is one of these equivalance sets. Let's call these equivalance - * sets: observably equivalant path sets. That name is chosen - * because equivalance of two paths means they cause the same side - * effects -- so they lead to the same subsequent observations other - * than that they may wind up in different target states. - * - * The superstate nfa, which is derived from this nfa, is based on - * the observation that all of the paths in an observably equivalant - * path set can be explored at the same time, provided that the - * matcher keeps track not of a single nfa state, but of a set of - * states. In particular, after following all the paths in an - * observably equivalant set, you wind up at a set of target states. - * That set of target states corresponds to one state in the - * superstate NFA. - * - * Staticly, before matching begins, it is convenient to analyze the - * nfa. Each state is labeled with a list of the observably - * equivalant path sets who's union covers all the - * epsilon-side-effect paths beginning in this state. This list is - * called the possible futures of the state. - * - * A trivial example is this NFA: - * s1 - * A ---> B - * - * s2 - * ---> C - * - * epsilon s1 - * ---------> D ------> E - * - * - * In this example, A has two possible futures. - * One invokes the side effect `s1' and contains two paths, - * one ending in state B, the other in state E. - * The other invokes the side effect `s2' and contains only - * one path, landing in state C. - */ - struct rx_possible_future *futures; - - - /* There are exactly two distinguished states in every NFA: */ - unsigned int is_final:1; - unsigned int is_start:1; - - /* These are used during NFA construction... */ - unsigned int eclosure_needed:1; - unsigned int mark:1; -}; - - -/* An edge in an NFA is typed: */ -enum rx_nfa_etype -{ - /* A cset edge is labled with a set of characters one of which - * must be matched for the edge to be taken. - */ - ne_cset, - - /* An epsilon edge is taken whenever its starting state is - * reached. - */ - ne_epsilon, - - /* A side effect edge is taken whenever its starting state is - * reached. Side effects may cause the match to fail or the - * position of the matcher to advance. - */ - ne_side_effect /* A special kind of epsilon. */ -}; - -struct rx_nfa_edge -{ - struct rx_nfa_edge *next; - enum rx_nfa_etype type; - struct rx_nfa_state *dest; - union - { - rx_Bitset cset; - rx_side_effect side_effect; - } params; -}; - - - -/* A possible future consists of a list of side effects - * and a set of destination states. Below are their - * representations. These structures are hash-consed which - * means that lists with the same elements share a representation - * (their addresses are ==). - */ - -struct rx_nfa_state_set -{ - struct rx_nfa_state * car; - struct rx_nfa_state_set * cdr; -}; - -struct rx_se_list -{ - rx_side_effect car; - struct rx_se_list * cdr; -}; - -struct rx_possible_future -{ - struct rx_possible_future *next; - struct rx_se_list * effects; - struct rx_nfa_state_set * destset; -}; - - - -/* This begins the description of the superstate NFA. - * - * The superstate NFA corresponds to the NFA in these ways: - * - * Every superstate NFA states SUPER correspond to sets of NFA states, - * nfa_states(SUPER). - * - * Superstate edges correspond to NFA paths. - * - * The superstate has no epsilon transitions; - * every edge has a character label, and a (possibly empty) side - * effect label. The side effect label corresponds to a list of - * side effects that occur in the NFA. These parts are referred - * to as: superedge_character(EDGE) and superedge_sides(EDGE). - * - * For a superstate edge EDGE starting in some superstate SUPER, - * the following is true (in pseudo-notation :-): - * - * exists DEST in nfa_states s.t. - * exists nfaEDGE in nfa_edges s.t. - * origin (nfaEDGE) == DEST - * && origin (nfaEDGE) is a member of nfa_states(SUPER) - * && exists PF in possible_futures(dest(nfaEDGE)) s.t. - * sides_of_possible_future (PF) == superedge_sides (EDGE) - * - * also: - * - * let SUPER2 := superedge_destination(EDGE) - * nfa_states(SUPER2) - * == union of all nfa state sets S s.t. - * exists PF in possible_futures(dest(nfaEDGE)) s.t. - * sides_of_possible_future (PF) == superedge_sides (EDGE) - * && S == dests_of_possible_future (PF) } - * - * Or in english, every superstate is a set of nfa states. A given - * character and a superstate implies many transitions in the NFA -- - * those that begin with an edge labeled with that character from a - * state in the set corresponding to the superstate. - * - * The destinations of those transitions each have a set of possible - * futures. A possible future is a list of side effects and a set of - * destination NFA states. Two sets of possible futures can be - * `merged' by combining all pairs of possible futures that have the - * same side effects. A pair is combined by creating a new future - * with the same side effect but the union of the two destination sets. - * In this way, all the possible futures suggested by a superstate - * and a character can be merged into a set of possible futures where - * no two elements of the set have the same set of side effects. - * - * The destination of a possible future, being a set of NFA states, - * corresponds to a supernfa state. So, the merged set of possible - * futures we just created can serve as a set of edges in the - * supernfa. - * - * The representation of the superstate nfa and the nfa is critical. - * The nfa has to be compact, but has to facilitate the rapid - * computation of missing superstates. The superstate nfa has to - * be fast to interpret, lazilly constructed, and bounded in space. - * - * To facilitate interpretation, the superstate data structures are - * peppered with `instruction frames'. There is an instruction set - * defined below which matchers using the supernfa must be able to - * interpret. - * - * We'd like to make it possible but not mandatory to use code - * addresses to represent instructions (c.f. gcc's computed goto). - * Therefore, we define an enumerated type of opcodes, and when - * writing one of these instructions into a data structure, use - * the opcode as an index into a table of instruction values. - * - * Here are the opcodes that occur in the superstate nfa: - */ - - -/* Every superstate contains a table of instruction frames indexed - * by characters. A normal `move' in a matcher is to fetch the next - * character and use it as an index into a superstates transition - * table. - * - * In the fasted case, only one edge follows from that character. - * In other cases there is more work to do. - * - * The descriptions of the opcodes refer to data structures that are - * described further below. - */ - -enum rx_opcode -{ - /* - * BACKTRACK_POINT is invoked when a character transition in - * a superstate leads to more than one edge. In that case, - * the edges have to be explored independently using a backtracking - * strategy. - * - * A BACKTRACK_POINT instruction is stored in a superstate's - * transition table for some character when it is known that that - * character crosses more than one edge. On encountering this - * instruction, the matcher saves enough state to backtrack to this - * point in the match later. - */ - rx_backtrack_point = 0, /* data is (struct transition_class *) */ - - /* - * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. - * There is one occurence of this instruction per rx_distinct_future. - * This instruction is skipped if a rx_distinct_future has no side effects. - */ - rx_do_side_effects = rx_backtrack_point + 1, - - /* data is (struct rx_distinct_future *) */ - - /* - * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose - * destination superstate has been reclaimed (or was never built). - * It recomputes the destination superstate. - * RX_CACHE_MISS is also stored in a superstate transition table before - * any of its edges have been built. - */ - rx_cache_miss = rx_do_side_effects + 1, - /* data is (struct rx_distinct_future *) */ - - /* - * RX_NEXT_CHAR is called to consume the next character and take the - * corresponding transition. This is the only instruction that uses - * the DATA field of the instruction frame instead of DATA_2. - * (see EXPLORE_FUTURE in regex.c). - */ - rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ - - /* RX_BACKTRACK indicates that a transition fails. - */ - rx_backtrack = rx_next_char + 1, /* no data */ - - /* - * RX_ERROR_INX is stored only in places that should never be executed. - */ - rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ - - rx_num_instructions = rx_error_inx + 1 -}; - -/* An id_instruction_table holds the values stored in instruction - * frames. The table is indexed by the enums declared above. - */ -extern void * rx_id_instruction_table[rx_num_instructions]; - -/* The heart of the matcher is a `word-code-interpreter' - * (like a byte-code interpreter, except that instructions - * are a full word wide). - * - * Instructions are not stored in a vector of code, instead, - * they are scattered throughout the data structures built - * by the regexp compiler and the matcher. One word-code instruction, - * together with the arguments to that instruction, constitute - * an instruction frame (struct rx_inx). - * - * This structure type is padded by hand to a power of 2 because - * in one of the dominant cases, we dispatch by indexing a table - * of instruction frames. If that indexing can be accomplished - * by just a shift of the index, we're happy. - * - * Instructions take at most one argument, but there are two - * slots in an instruction frame that might hold that argument. - * These are called data and data_2. The data slot is only - * used for one instruction (RX_NEXT_CHAR). For all other - * instructions, data should be set to 0. - * - * RX_NEXT_CHAR is the most important instruction by far. - * By reserving the data field for its exclusive use, - * instruction dispatch is sped up in that case. There is - * no need to fetch both the instruction and the data, - * only the data is needed. In other words, a `cycle' begins - * by fetching the field data. If that is non-0, then it must - * be the destination state of a next_char transition, so - * make that value the current state, advance the match position - * by one character, and start a new cycle. On the other hand, - * if data is 0, fetch the instruction and do a more complicated - * dispatch on that. - */ - -struct rx_inx -{ - void * data; - void * data_2; - void * inx; - void * fnord; -}; - -#ifndef RX_TAIL_ARRAY -#define RX_TAIL_ARRAY 1 -#endif - -/* A superstate corresponds to a set of nfa states. Those sets are - * represented by STRUCT RX_SUPERSET. The constructors - * guarantee that only one (shared) structure is created for a given set. - */ -struct rx_superset -{ - int refs; /* This is a reference counted structure. */ - - /* We keep these sets in a cache because (in an unpredictable way), - * the same set is often created again and again. But that is also - * problematic -- compatibility with POSIX and GNU regex requires - * that we not be able to tell when a program discards a particular - * NFA (thus invalidating the supersets created from it). - * - * But when a cache hit appears to occur, we will have in hand the - * nfa for which it may have happened. That is why every nfa is given - * its own sequence number. On a cache hit, the cache is validated - * by comparing the nfa sequence number to this field: - */ - int id; - - struct rx_nfa_state * car; /* May or may not be a valid addr. */ - struct rx_superset * cdr; - - /* If the corresponding superstate exists: */ - struct rx_superstate * superstate; - - - /* There is another bookkeeping problem. It is expensive to - * compute the starting nfa state set for an nfa. So, once computed, - * it is cached in the `struct rx'. - * - * But, the state set can be flushed from the superstate cache. - * When that happens, we can't know if the corresponding `struct rx' - * is still alive or if it has been freed or re-used by the program. - * So, the cached pointer to this set in a struct rx might be invalid - * and we need a way to validate it. - * - * Fortunately, even if this set is flushed from the cache, it is - * not freed. It just goes on the free-list of supersets. - * So we can still examine it. - * - * So to validate a starting set memo, check to see if the - * starts_for field still points back to the struct rx in question, - * and if the ID matches the rx sequence number. - */ - struct rx * starts_for; - - /* This is used to link into a hash bucket so these objects can - * be `hash-consed'. - */ - struct rx_hash_item hash_item; -}; - -#define rx_protect_superset(RX,CON) (++(CON)->refs) - -/* The terminology may be confusing (rename this structure?). - * Every character occurs in at most one rx_super_edge per super-state. - * But, that structure might have more than one option, indicating a point - * of non-determinism. - * - * In other words, this structure holds a list of superstate edges - * sharing a common starting state and character label. The edges - * are in the field OPTIONS. All superstate edges sharing the same - * starting state and character are in this list. - */ -struct rx_super_edge -{ - struct rx_super_edge *next; - struct rx_inx rx_backtrack_frame; - int cset_size; - rx_Bitset cset; - struct rx_distinct_future *options; -}; - -/* A superstate is a set of nfa states (RX_SUPERSET) along - * with a transition table. Superstates are built on demand and reclaimed - * without warning. To protect a superstate from this ghastly fate, - * use LOCK_SUPERSTATE. - */ -struct rx_superstate -{ - int rx_id; /* c.f. the id field of rx_superset */ - int locks; /* protection from reclamation */ - - /* Within a superstate cache, all the superstates are kept in a big - * queue. The tail of the queue is the state most likely to be - * reclaimed. The *recyclable fields hold the queue position of - * this state. - */ - struct rx_superstate * next_recyclable; - struct rx_superstate * prev_recyclable; - - /* The supernfa edges that exist in the cache and that have - * this state as their destination are kept in this list: - */ - struct rx_distinct_future * transition_refs; - - /* The list of nfa states corresponding to this superstate: */ - struct rx_superset * contents; - - /* The list of edges in the cache beginning from this state. */ - struct rx_super_edge * edges; - - /* A tail of the recyclable queue is marked as semifree. A semifree - * state has no incoming next_char transitions -- any transition - * into a semifree state causes a complex dispatch with the side - * effect of rescuing the state from its semifree state. - * - * An alternative to this might be to make next_char more expensive, - * and to move a state to the head of the recyclable queue whenever - * it is entered. That way, popular states would never be recycled. - * - * But unilaterally making next_char more expensive actually loses. - * So, incoming transitions are only made expensive for states near - * the tail of the recyclable queue. The more cache contention - * there is, the more frequently a state will have to prove itself - * and be moved back to the front of the queue. If there is less - * contention, then popular states just aggregate in the front of - * the queue and stay there. - */ - int is_semifree; - - - /* This keeps track of the size of the transition table for this - * state. There is a half-hearted attempt to support variable sized - * superstates. - */ - int trans_size; - - /* Indexed by characters... */ - struct rx_inx transitions[RX_TAIL_ARRAY]; -}; - - -/* A list of distinct futures define the edges that leave from a - * given superstate on a given character. c.f. rx_super_edge. - */ - -struct rx_distinct_future -{ - struct rx_distinct_future * next_same_super_edge[2]; - struct rx_distinct_future * next_same_dest; - struct rx_distinct_future * prev_same_dest; - struct rx_superstate * present; /* source state */ - struct rx_superstate * future; /* destination state */ - struct rx_super_edge * edge; - - - /* The future_frame holds the instruction that should be executed - * after all the side effects are done, when it is time to complete - * the transition to the next state. - * - * Normally this is a next_char instruction, but it may be a - * cache_miss instruction as well, depending on whether or not - * the superstate is in the cache and semifree. - * - * If this is the only future for a given superstate/char, and - * if there are no side effects to be performed, this frame is - * not used (directly) at all. Instead, its contents are copied - * into the transition table of the starting state of this dist. future. - */ - struct rx_inx future_frame; - - struct rx_inx side_effects_frame; - struct rx_se_list * effects; -}; - -#define rx_lock_superstate(R,S) ((S)->locks++) -#define rx_unlock_superstate(R,S) (--(S)->locks) - - -/* This page destined for rx.h */ - -struct rx_blocklist -{ - struct rx_blocklist * next; - int bytes; -}; - -struct rx_freelist -{ - struct rx_freelist * next; -}; - -struct rx_cache; - -#ifdef __STDC__ -typedef void (*rx_morecore_fn)(struct rx_cache *); -#else -typedef void (*rx_morecore_fn)(); -#endif - -/* You use this to control the allocation of superstate data - * during matching. Most of it should be initialized to 0. - * - * A MORECORE function is necessary. It should allocate - * a new block of memory or return 0. - * A default that uses malloc is called `rx_morecore'. - * - * The number of SUPERSTATES_ALLOWED indirectly limits how much memory - * the system will try to allocate. The default is 128. Batch style - * applications that are very regexp intensive should use as high a number - * as possible without thrashing. - * - * The LOCAL_CSET_SIZE is the number of characters in a character set. - * It is therefore the number of entries in a superstate transition table. - * Generally, it should be 256. If your character set has 16 bits, - * it is better to translate your regexps into equivalent 8 bit patterns. - */ - -struct rx_cache -{ - struct rx_hash_rules superset_hash_rules; - - /* Objects are allocated by incrementing a pointer that |