diff options
Diffstat (limited to 'include/regex.h')
-rw-r--r-- | include/regex.h | 3754 |
1 files changed, 3754 insertions, 0 deletions
diff --git a/include/regex.h b/include/regex.h new file mode 100644 index 000000000..813882c42 --- /dev/null +++ b/include/regex.h @@ -0,0 +1,3754 @@ +#if !defined(_RX_H) || defined(RX_WANT_SE_DEFS) +#define _RX_H + +/* Copyright (C) 1992, 1993 Free Software Foundation, Inc. + +This file is part of the librx library. + +Librx is free software; you can redistribute it and/or modify it under +the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +Librx is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU Library General Public +License along with this software; see the file COPYING.LIB. If not, +write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA +02139, USA. */ +/* t. lord Wed Sep 23 18:20:57 1992 */ + + + + +#include <features.h> + +#define __need_size_t +#include <stddef.h> + +#include <string.h> + +#if RX_WANT_SE_DEFS != 1 +__BEGIN_DECLS +#endif + +#ifndef RX_WANT_SE_DEFS + +/* This page: Bitsets */ + +#ifndef RX_subset +typedef unsigned int RX_subset; +#define RX_subset_bits (32) +#define RX_subset_mask (RX_subset_bits - 1) +#endif + +typedef RX_subset * rx_Bitset; + +#ifdef __STDC__ +typedef void (*rx_bitset_iterator) (void *, int member_index); +#else +typedef void (*rx_bitset_iterator) (); +#endif + +#define rx_bitset_subset(N) ((N) / RX_subset_bits) +#define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)]) +#define RX_bitset_access(B,N,OP) \ + ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask]) +#define RX_bitset_member(B,N) RX_bitset_access(B, N, &) +#define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=) +#define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~) +#define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= ) +#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits) +#define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset)) + + + +/* This page: Splay trees. */ + +#ifdef __STDC__ +typedef int (*rx_sp_comparer) (void * a, void * b); +#else +typedef int (*rx_sp_comparer) (); +#endif + +struct rx_sp_node +{ + void * key; + void * data; + struct rx_sp_node * kids[2]; +}; + +#ifdef __STDC__ +typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *); +#else +typedef void (*rx_sp_key_data_freer) (); +#endif + + +/* giant inflatable hash trees */ + +struct rx_hash_item +{ + struct rx_hash_item * next_same_hash; + struct rx_hash * table; + unsigned long hash; + void * data; + void * binding; +}; + +struct rx_hash +{ + struct rx_hash * parent; + int refs; + struct rx_hash * children[13]; + struct rx_hash_item * buckets [13]; + int bucket_size [13]; +}; + +struct rx_hash_rules; + +#ifdef __STDC__ +/* should return like == */ +typedef int (*rx_hash_eq)(void *, void *); +typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *); +typedef void (*rx_free_hash)(struct rx_hash *, + struct rx_hash_rules *); +typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *, + void *); +typedef void (*rx_free_hash_item)(struct rx_hash_item *, + struct rx_hash_rules *); +#else +typedef int (*rx_hash_eq)(); +typedef struct rx_hash * (*rx_alloc_hash)(); +typedef void (*rx_free_hash)(); +typedef struct rx_hash_item * (*rx_alloc_hash_item)(); +typedef void (*rx_free_hash_item)(); +#endif + +struct rx_hash_rules +{ + rx_hash_eq eq; + rx_alloc_hash hash_alloc; + rx_free_hash free_hash; + rx_alloc_hash_item hash_item_alloc; + rx_free_hash_item free_hash_item; +}; + + +/* Forward declarations */ + +struct rx_cache; +struct rx_superset; +struct rx; +struct rx_se_list; + + + +/* + * GLOSSARY + * + * regexp + * regular expression + * expression + * pattern - a `regular' expression. The expression + * need not be formally regular -- it can contain + * constructs that don't correspond to purely regular + * expressions. + * + * buffer + * string - the string (or strings) being searched or matched. + * + * pattern buffer - a structure of type `struct re_pattern_buffer' + * This in turn contains a `struct rx', which holds the + * NFA compiled from a pattern, as well as some of the state + * of a matcher using the pattern. + * + * NFA - nondeterministic finite automata. Some people + * use this term to a member of the class of + * regular automata (those corresponding to a regular + * language). However, in this code, the meaning is + * more general. The automata used by Rx are comperable + * in power to what are usually called `push down automata'. + * + * Two NFA are built by rx for every pattern. One is built + * by the compiler. The other is built from the first, on + * the fly, by the matcher. The latter is called the `superstate + * NFA' because its states correspond to sets of states from + * the first NFA. (Joe Keane gets credit for the name + * `superstate NFA'). + * + * NFA edges + * epsilon edges + * side-effect edges - The NFA compiled from a pattern can have three + * kinds of edges. Epsilon edges can be taken freely anytime + * their source state is reached. Character set edges can be + * taken when their source state is reached and when the next + * character in the buffer is a member of the set. Side effect + * edges imply a transition that can only be taken after the + * indicated side effect has been successfully accomplished. + * Some examples of side effects are: + * + * Storing the current match position to record the + * location of a parentesized subexpression. + * + * Advancing the matcher over N characters if they + * match the N characters previously matched by a + * parentesized subexpression. + * + * Both of those kinds of edges occur in the NFA generated + * by the pattern: \(.\)\1 + * + * Epsilon and side effect edges are similar. Unfortunately, + * some of the code uses the name `epsilon edge' to mean + * both epsilon and side effect edges. For example, the + * function has_non_idempotent_epsilon_path computes the existance + * of a non-trivial path containing only a mix of epsilon and + * side effect edges. In that case `nonidempotent epsilon' is being + * used to mean `side effect'. + */ + + + + + +/* LOW LEVEL PATTERN BUFFERS */ + +/* Suppose that from some NFA state, more than one path through + * side-effect edges is possible. In what order should the paths + * be tried? A function of type rx_se_list_order answers that + * question. It compares two lists of side effects, and says + * which list comes first. + */ + +#ifdef __STDC__ +typedef int (*rx_se_list_order) (struct rx *, + struct rx_se_list *, + struct rx_se_list *); +#else +typedef int (*rx_se_list_order) (); +#endif + + + +/* Struct RX holds a compiled regular expression - that is, an nfa + * ready to be converted on demand to a more efficient superstate nfa. + * This is for the low level interface. The high-level interfaces enclose + * this in a `struct re_pattern_buffer'. + */ +struct rx +{ + /* The compiler assigns a unique id to every pattern. + * Like sequence numbers in X, there is a subtle bug here + * if you use Rx in a system that runs for a long time. + * But, because of the way the caches work out, it is almost + * impossible to trigger the Rx version of this bug. + * + * The id is used to validate superstates found in a cache + * of superstates. It isn't sufficient to let a superstate + * point back to the rx for which it was compiled -- the caller + * may be re-using a `struct rx' in which case the superstate + * is not really valid. So instead, superstates are validated + * by checking the sequence number of the pattern for which + * they were built. + */ + int rx_id; + + /* This is memory mgt. state for superstates. This may be + * shared by more than one struct rx. + */ + struct rx_cache * cache; + + /* Every regex defines the size of its own character set. + * A superstate has an array of this size, with each element + * a `struct rx_inx'. So, don't make this number too large. + * In particular, don't make it 2^16. + */ + int local_cset_size; + + /* After the NFA is built, it is copied into a contiguous region + * of memory (mostly for compatability with GNU regex). + * Here is that region, and it's size: + */ + void * buffer; + unsigned long allocated; + + /* Clients of RX can ask for some extra storage in the space pointed + * to by BUFFER. The field RESERVED is an input parameter to the + * compiler. After compilation, this much space will be available + * at (buffer + allocated - reserved) + */ + unsigned long reserved; + + /* --------- The remaining fields are for internal use only. --------- */ + /* --------- But! they must be initialized to 0. --------- */ + + /* NODEC is the number of nodes in the NFA with non-epsilon + * transitions. + */ + int nodec; + + /* EPSNODEC is the number of nodes with only epsilon transitions. */ + int epsnodec; + + /* The sum (NODEC + EPSNODEC) is the total number of states in the + * compiled NFA. + */ + + /* Lists of side effects as stored in the NFA are `hash consed'..meaning + * that lists with the same elements are ==. During compilation, + * this table facilitates hash-consing. + */ + struct rx_hash se_list_memo; + + /* Lists of NFA states are also hashed. + */ + struct rx_hash set_list_memo; + + + + + /* The compiler and matcher must build a number of instruction frames. + * The format of these frames is fixed (c.f. struct rx_inx). The values + * of the instructions is not fixed. + * + * An enumerated type (enum rx_opcode) defines the set of instructions + * that the compiler or matcher might generate. When filling an instruction + * frame, the INX field is found by indexing this instruction table + * with an opcode: + */ + void ** instruction_table; + + /* The list of all states in an NFA. + * During compilation, the NEXT field of NFA states links this list. + * After compilation, all the states are compacted into an array, + * ordered by state id numbers. At that time, this points to the base + * of that array. + */ + struct rx_nfa_state *nfa_states; + + /* Every nfa begins with one distinguished starting state: + */ + struct rx_nfa_state *start; + + /* This orders the search through super-nfa paths. + * See the comment near the typedef of rx_se_list_order. + */ + rx_se_list_order se_list_cmp; + + struct rx_superset * start_set; +}; + + + + +/* SYNTAX TREES */ + +/* Compilation is in stages. + * + * In the first stage, a pattern specified by a string is + * translated into a syntax tree. Later stages will convert + * the syntax tree into an NFA optimized for conversion to a + * superstate-NFA. + * + * This page is about syntax trees. + */ + +enum rexp_node_type +{ + r_cset, /* Match from a character set. `a' or `[a-z]'*/ + r_concat, /* Concat two subexpressions. `ab' */ + r_alternate, /* Choose one of two subexpressions. `a\|b' */ + r_opt, /* Optional subexpression. `a?' */ + r_star, /* Repeated subexpression. `a*' */ + + + /* A 2phase-star is a variation on a repeated subexpression. + * In this case, there are two subexpressions. The first, if matched, + * begins a repitition (otherwise, the whole expression is matches the + * empth string). + * + * After matching the first subexpression, a 2phase star either finishes, + * or matches the second subexpression. If the second subexpression is + * matched, then the whole construct repeats. + * + * 2phase stars are used in two circumstances. First, they + * are used as part of the implementation of POSIX intervals (counted + * repititions). Second, they are used to implement proper star + * semantics when the repeated subexpression contains paths of + * only side effects. See rx_compile for more information. + */ + r_2phase_star, + + + /* c.f. "typedef void * rx_side_effect" */ + r_side_effect, + + /* This is an extension type: It is for transient use in source->source + * transformations (implemented over syntax trees). + */ + r_data +}; + +/* A side effect is a matcher-specific action associated with + * transitions in the NFA. The details of side effects are up + * to the matcher. To the compiler and superstate constructors + * side effects are opaque: + */ + +typedef void * rx_side_effect; + +/* Nodes in a syntax tree are of this type: + */ +struct rexp_node +{ + enum rexp_node_type type; + union + { + rx_Bitset cset; + rx_side_effect side_effect; + struct + { + struct rexp_node *left; + struct rexp_node *right; + } pair; + void * data; + } params; +}; + + + +/* NFA + * + * A syntax tree is compiled into an NFA. This page defines the structure + * of that NFA. + */ + +struct rx_nfa_state +{ + /* These are kept in a list as the NFA is being built. */ + struct rx_nfa_state *next; + + /* After the NFA is built, states are given integer id's. + * States whose outgoing transitions are all either epsilon or + * side effect edges are given ids less than 0. Other states + * are given successive non-negative ids starting from 0. + */ + int id; + + /* The list of NFA edges that go from this state to some other. */ + struct rx_nfa_edge *edges; + + /* If you land in this state, then you implicitly land + * in all other states reachable by only epsilon translations. + * Call the set of maximal paths to such states the epsilon closure + * of this state. + * + * There may be other states that are reachable by a mixture of + * epsilon and side effect edges. Consider the set of maximal paths + * of that sort from this state. Call it the epsilon-side-effect + * closure of the state. + * + * The epsilon closure of the state is a subset of the epsilon-side- + * effect closure. It consists of all the paths that contain + * no side effects -- only epsilon edges. + * + * The paths in the epsilon-side-effect closure can be partitioned + * into equivalance sets. Two paths are equivalant if they have the + * same set of side effects, in the same order. The epsilon-closure + * is one of these equivalance sets. Let's call these equivalance + * sets: observably equivalant path sets. That name is chosen + * because equivalance of two paths means they cause the same side + * effects -- so they lead to the same subsequent observations other + * than that they may wind up in different target states. + * + * The superstate nfa, which is derived from this nfa, is based on + * the observation that all of the paths in an observably equivalant + * path set can be explored at the same time, provided that the + * matcher keeps track not of a single nfa state, but of a set of + * states. In particular, after following all the paths in an + * observably equivalant set, you wind up at a set of target states. + * That set of target states corresponds to one state in the + * superstate NFA. + * + * Staticly, before matching begins, it is convenient to analyze the + * nfa. Each state is labeled with a list of the observably + * equivalant path sets who's union covers all the + * epsilon-side-effect paths beginning in this state. This list is + * called the possible futures of the state. + * + * A trivial example is this NFA: + * s1 + * A ---> B + * + * s2 + * ---> C + * + * epsilon s1 + * ---------> D ------> E + * + * + * In this example, A has two possible futures. + * One invokes the side effect `s1' and contains two paths, + * one ending in state B, the other in state E. + * The other invokes the side effect `s2' and contains only + * one path, landing in state C. + */ + struct rx_possible_future *futures; + + + /* There are exactly two distinguished states in every NFA: */ + unsigned int is_final:1; + unsigned int is_start:1; + + /* These are used during NFA construction... */ + unsigned int eclosure_needed:1; + unsigned int mark:1; +}; + + +/* An edge in an NFA is typed: */ +enum rx_nfa_etype +{ + /* A cset edge is labled with a set of characters one of which + * must be matched for the edge to be taken. + */ + ne_cset, + + /* An epsilon edge is taken whenever its starting state is + * reached. + */ + ne_epsilon, + + /* A side effect edge is taken whenever its starting state is + * reached. Side effects may cause the match to fail or the + * position of the matcher to advance. + */ + ne_side_effect /* A special kind of epsilon. */ +}; + +struct rx_nfa_edge +{ + struct rx_nfa_edge *next; + enum rx_nfa_etype type; + struct rx_nfa_state *dest; + union + { + rx_Bitset cset; + rx_side_effect side_effect; + } params; +}; + + + +/* A possible future consists of a list of side effects + * and a set of destination states. Below are their + * representations. These structures are hash-consed which + * means that lists with the same elements share a representation + * (their addresses are ==). + */ + +struct rx_nfa_state_set +{ + struct rx_nfa_state * car; + struct rx_nfa_state_set * cdr; +}; + +struct rx_se_list +{ + rx_side_effect car; + struct rx_se_list * cdr; +}; + +struct rx_possible_future +{ + struct rx_possible_future *next; + struct rx_se_list * effects; + struct rx_nfa_state_set * destset; +}; + + + +/* This begins the description of the superstate NFA. + * + * The superstate NFA corresponds to the NFA in these ways: + * + * Every superstate NFA states SUPER correspond to sets of NFA states, + * nfa_states(SUPER). + * + * Superstate edges correspond to NFA paths. + * + * The superstate has no epsilon transitions; + * every edge has a character label, and a (possibly empty) side + * effect label. The side effect label corresponds to a list of + * side effects that occur in the NFA. These parts are referred + * to as: superedge_character(EDGE) and superedge_sides(EDGE). + * + * For a superstate edge EDGE starting in some superstate SUPER, + * the following is true (in pseudo-notation :-): + * + * exists DEST in nfa_states s.t. + * exists nfaEDGE in nfa_edges s.t. + * origin (nfaEDGE) == DEST + * && origin (nfaEDGE) is a member of nfa_states(SUPER) + * && exists PF in possible_futures(dest(nfaEDGE)) s.t. + * sides_of_possible_future (PF) == superedge_sides (EDGE) + * + * also: + * + * let SUPER2 := superedge_destination(EDGE) + * nfa_states(SUPER2) + * == union of all nfa state sets S s.t. + * exists PF in possible_futures(dest(nfaEDGE)) s.t. + * sides_of_possible_future (PF) == superedge_sides (EDGE) + * && S == dests_of_possible_future (PF) } + * + * Or in english, every superstate is a set of nfa states. A given + * character and a superstate implies many transitions in the NFA -- + * those that begin with an edge labeled with that character from a + * state in the set corresponding to the superstate. + * + * The destinations of those transitions each have a set of possible + * futures. A possible future is a list of side effects and a set of + * destination NFA states. Two sets of possible futures can be + * `merged' by combining all pairs of possible futures that have the + * same side effects. A pair is combined by creating a new future + * with the same side effect but the union of the two destination sets. + * In this way, all the possible futures suggested by a superstate + * and a character can be merged into a set of possible futures where + * no two elements of the set have the same set of side effects. + * + * The destination of a possible future, being a set of NFA states, + * corresponds to a supernfa state. So, the merged set of possible + * futures we just created can serve as a set of edges in the + * supernfa. + * + * The representation of the superstate nfa and the nfa is critical. + * The nfa has to be compact, but has to facilitate the rapid + * computation of missing superstates. The superstate nfa has to + * be fast to interpret, lazilly constructed, and bounded in space. + * + * To facilitate interpretation, the superstate data structures are + * peppered with `instruction frames'. There is an instruction set + * defined below which matchers using the supernfa must be able to + * interpret. + * + * We'd like to make it possible but not mandatory to use code + * addresses to represent instructions (c.f. gcc's computed goto). + * Therefore, we define an enumerated type of opcodes, and when + * writing one of these instructions into a data structure, use + * the opcode as an index into a table of instruction values. + * + * Here are the opcodes that occur in the superstate nfa: + */ + + +/* Every superstate contains a table of instruction frames indexed + * by characters. A normal `move' in a matcher is to fetch the next + * character and use it as an index into a superstates transition + * table. + * + * In the fasted case, only one edge follows from that character. + * In other cases there is more work to do. + * + * The descriptions of the opcodes refer to data structures that are + * described further below. + */ + +enum rx_opcode +{ + /* + * BACKTRACK_POINT is invoked when a character transition in + * a superstate leads to more than one edge. In that case, + * the edges have to be explored independently using a backtracking + * strategy. + * + * A BACKTRACK_POINT instruction is stored in a superstate's + * transition table for some character when it is known that that + * character crosses more than one edge. On encountering this + * instruction, the matcher saves enough state to backtrack to this + * point in the match later. + */ + rx_backtrack_point = 0, /* data is (struct transition_class *) */ + + /* + * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. + * There is one occurence of this instruction per rx_distinct_future. + * This instruction is skipped if a rx_distinct_future has no side effects. + */ + rx_do_side_effects = rx_backtrack_point + 1, + + /* data is (struct rx_distinct_future *) */ + + /* + * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose + * destination superstate has been reclaimed (or was never built). + * It recomputes the destination superstate. + * RX_CACHE_MISS is also stored in a superstate transition table before + * any of its edges have been built. + */ + rx_cache_miss = rx_do_side_effects + 1, + /* data is (struct rx_distinct_future *) */ + + /* + * RX_NEXT_CHAR is called to consume the next character and take the + * corresponding transition. This is the only instruction that uses + * the DATA field of the instruction frame instead of DATA_2. + * (see EXPLORE_FUTURE in regex.c). + */ + rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ + + /* RX_BACKTRACK indicates that a transition fails. + */ + rx_backtrack = rx_next_char + 1, /* no data */ + + /* + * RX_ERROR_INX is stored only in places that should never be executed. + */ + rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ + + rx_num_instructions = rx_error_inx + 1 +}; + +/* An id_instruction_table holds the values stored in instruction + * frames. The table is indexed by the enums declared above. + */ +extern void * rx_id_instruction_table[rx_num_instructions]; + +/* The heart of the matcher is a `word-code-interpreter' + * (like a byte-code interpreter, except that instructions + * are a full word wide). + * + * Instructions are not stored in a vector of code, instead, + * they are scattered throughout the data structures built + * by the regexp compiler and the matcher. One word-code instruction, + * together with the arguments to that instruction, constitute + * an instruction frame (struct rx_inx). + * + * This structure type is padded by hand to a power of 2 because + * in one of the dominant cases, we dispatch by indexing a table + * of instruction frames. If that indexing can be accomplished + * by just a shift of the index, we're happy. + * + * Instructions take at most one argument, but there are two + * slots in an instruction frame that might hold that argument. + * These are called data and data_2. The data slot is only + * used for one instruction (RX_NEXT_CHAR). For all other + * instructions, data should be set to 0. + * + * RX_NEXT_CHAR is the most important instruction by far. + * By reserving the data field for its exclusive use, + * instruction dispatch is sped up in that case. There is + * no need to fetch both the instruction and the data, + * only the data is needed. In other words, a `cycle' begins + * by fetching the field data. If that is non-0, then it must + * be the destination state of a next_char transition, so + * make that value the current state, advance the match position + * by one character, and start a new cycle. On the other hand, + * if data is 0, fetch the instruction and do a more complicated + * dispatch on that. + */ + +struct rx_inx +{ + void * data; + void * data_2; + void * inx; + void * fnord; +}; + +#ifndef RX_TAIL_ARRAY +#define RX_TAIL_ARRAY 1 +#endif + +/* A superstate corresponds to a set of nfa states. Those sets are + * represented by STRUCT RX_SUPERSET. The constructors + * guarantee that only one (shared) structure is created for a given set. + */ +struct rx_superset +{ + int refs; /* This is a reference counted structure. */ + + /* We keep these sets in a cache because (in an unpredictable way), + * the same set is often created again and again. But that is also + * problematic -- compatibility with POSIX and GNU regex requires + * that we not be able to tell when a program discards a particular + * NFA (thus invalidating the supersets created from it). + * + * But when a cache hit appears to occur, we will have in hand the + * nfa for which it may have happened. That is why every nfa is given + * its own sequence number. On a cache hit, the cache is validated + * by comparing the nfa sequence number to this field: + */ + int id; + + struct rx_nfa_state * car; /* May or may not be a valid addr. */ + struct rx_superset * cdr; + + /* If the corresponding superstate exists: */ + struct rx_superstate * superstate; + + + /* There is another bookkeeping problem. It is expensive to + * compute the starting nfa state set for an nfa. So, once computed, + * it is cached in the `struct rx'. + * + * But, the state set can be flushed from the superstate cache. + * When that happens, we can't know if the corresponding `struct rx' + * is still alive or if it has been freed or re-used by the program. + * So, the cached pointer to this set in a struct rx might be invalid + * and we need a way to validate it. + * + * Fortunately, even if this set is flushed from the cache, it is + * not freed. It just goes on the free-list of supersets. + * So we can still examine it. + * + * So to validate a starting set memo, check to see if the + * starts_for field still points back to the struct rx in question, + * and if the ID matches the rx sequence number. + */ + struct rx * starts_for; + + /* This is used to link into a hash bucket so these objects can + * be `hash-consed'. + */ + struct rx_hash_item hash_item; +}; + +#define rx_protect_superset(RX,CON) (++(CON)->refs) + +/* The terminology may be confusing (rename this structure?). + * Every character occurs in at most one rx_super_edge per super-state. + * But, that structure might have more than one option, indicating a point + * of non-determinism. + * + * In other words, this structure holds a list of superstate edges + * sharing a common starting state and character label. The edges + * are in the field OPTIONS. All superstate edges sharing the same + * starting state and character are in this list. + */ +struct rx_super_edge +{ + struct rx_super_edge *next; + struct rx_inx rx_backtrack_frame; + int cset_size; + rx_Bitset cset; + struct rx_distinct_future *options; +}; + +/* A superstate is a set of nfa states (RX_SUPERSET) along + * with a transition table. Superstates are built on demand and reclaimed + * without warning. To protect a superstate from this ghastly fate, + * use LOCK_SUPERSTATE. + */ +struct rx_superstate +{ + int rx_id; /* c.f. the id field of rx_superset */ + int locks; /* protection from reclamation */ + + /* Within a superstate cache, all the superstates are kept in a big + * queue. The tail of the queue is the state most likely to be + * reclaimed. The *recyclable fields hold the queue position of + * this state. + */ + struct rx_superstate * next_recyclable; + struct rx_superstate * prev_recyclable; + + /* The supernfa edges that exist in the cache and that have + * this state as their destination are kept in this list: + */ + struct rx_distinct_future * transition_refs; + + /* The list of nfa states corresponding to this superstate: */ + struct rx_superset * contents; + + /* The list of edges in the cache beginning from this state. */ + struct rx_super_edge * edges; + + /* A tail of the recyclable queue is marked as semifree. A semifree + * state has no incoming next_char transitions -- any transition + * into a semifree state causes a complex dispatch with the side + * effect of rescuing the state from its semifree state. + * + * An alternative to this might be to make next_char more expensive, + * and to move a state to the head of the recyclable queue whenever + * it is entered. That way, popular states would never be recycled. + * + * But unilaterally making next_char more expensive actually loses. + * So, incoming transitions are only made expensive for states near + * the tail of the recyclable queue. The more cache contention + * there is, the more frequently a state will have to prove itself + * and be moved back to the front of the queue. If there is less + * contention, then popular states just aggregate in the front of + * the queue and stay there. + */ + int is_semifree; + + + /* This keeps track of the size of the transition table for this + * state. There is a half-hearted attempt to support variable sized + * superstates. + */ + int trans_size; + + /* Indexed by characters... */ + struct rx_inx transitions[RX_TAIL_ARRAY]; +}; + + +/* A list of distinct futures define the edges that leave from a + * given superstate on a given character. c.f. rx_super_edge. + */ + +struct rx_distinct_future +{ + struct rx_distinct_future * next_same_super_edge[2]; + struct rx_distinct_future * next_same_dest; + struct rx_distinct_future * prev_same_dest; + struct rx_superstate * present; /* source state */ + struct rx_superstate * future; /* destination state */ + struct rx_super_edge * edge; + + + /* The future_frame holds the instruction that should be executed + * after all the side effects are done, when it is time to complete + * the transition to the next state. + * + * Normally this is a next_char instruction, but it may be a + * cache_miss instruction as well, depending on whether or not + * the superstate is in the cache and semifree. + * + * If this is the only future for a given superstate/char, and + * if there are no side effects to be performed, this frame is + * not used (directly) at all. Instead, its contents are copied + * into the transition table of the starting state of this dist. future. + */ + struct rx_inx future_frame; + + struct rx_inx side_effects_frame; + struct rx_se_list * effects; +}; + +#define rx_lock_superstate(R,S) ((S)->locks++) +#define rx_unlock_superstate(R,S) (--(S)->locks) + + +/* This page destined for rx.h */ + +struct rx_blocklist +{ + struct rx_blocklist * next; + int bytes; +}; + +struct rx_freelist +{ + struct rx_freelist * next; +}; + +struct rx_cache; + +#ifdef __STDC__ +typedef void (*rx_morecore_fn)(struct rx_cache *); +#else +typedef void (*rx_morecore_fn)(); +#endif + +/* You use this to control the allocation of superstate data + * during matching. Most of it should be initialized to 0. + * + * A MORECORE function is necessary. It should allocate + * a new block of memory or return 0. + * A default that uses malloc is called `rx_morecore'. + * + * The number of SUPERSTATES_ALLOWED indirectly limits how much memory + * the system will try to allocate. The default is 128. Batch style + * applications that are very regexp intensive should use as high a number + * as possible without thrashing. + * + * The LOCAL_CSET_SIZE is the number of characters in a character set. + * It is therefore the number of entries in a superstate transition table. + * Generally, it should be 256. If your character set has 16 bits, + * it is better to translate your regexps into equivalent 8 bit patterns. + */ + +struct rx_cache +{ + struct rx_hash_rules superset_hash_rules; + + /* Objects are allocated by incrementing a pointer that + * scans across rx_blocklists. + */ + struct rx_blocklist * memory; + struct rx_blocklist * memory_pos; + int bytes_left; + char * memory_addr; + rx_morecore_fn morecore; + + /* Freelists. */ + struct rx_freelist * free_superstates; + struct rx_freelist * free_transition_classes; + struct rx_freelist * free_discernable_futures; + struct rx_freelist * free_supersets; + struct rx_freelist * free_hash; + + /* Two sets of superstates -- those that are semifreed, and those + * that are being used. + */ + struct rx_superstate * lru_superstate; + struct rx_superstate * semifree_superstate; + + struct rx_superset * empty_superset; + + int superstates; + int semifree_superstates; + int hits; + int misses; + int superstates_allowed; + + int local_cset_size; + void ** instruction_table; + + struct rx_hash superset_table; +}; + + + +/* The lowest-level search function supports arbitrarily fragmented + * strings and (optionally) suspendable/resumable searches. + * + * Callers have to provide a few hooks. + */ + +#ifndef __GNUC__ +#ifdef __STDC__ +#define __const__ const +#else +#define __const__ +#endif +#endif + +/* This holds a matcher position */ +struct rx_string_position +{ + __const__ unsigned char * pos; /* The current pos. */ + __const__ unsigned char * string; /* The current string burst. */ + __const__ unsigned char * end; /* First invalid position >= POS. */ + int offset; /* Integer address of the current burst. */ + int size; /* Current string's size. */ + int search_direction; /* 1 or -1 */ + int search_end; /* First position to not try. */ +}; + + +enum rx_get_burst_return +{ + rx_get_burst_continuation, + rx_get_burst_error, + rx_get_burst_ok, + rx_get_burst_no_more +}; + + +/* A call to get burst should make POS valid. It might be invalid + * if the STRING field doesn't point to a burst that actually + * contains POS. + * + * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to + * whether or not to pad to the left. Padding to the right is always + * appropriate, but need not go past the point indicated by STOP. + * + * If a continuation is returned, then the reentering call to + * a search function will retry the get_burst. + */ + +#ifdef __STDC__ +typedef enum rx_get_burst_return + (*rx_get_burst_fn) (struct rx_string_position * pos, + void * app_closure, + int stop); + +#else < |