diff options
author | Eric Andersen <andersen@codepoet.org> | 2000-10-20 03:48:11 +0000 |
---|---|---|
committer | Eric Andersen <andersen@codepoet.org> | 2000-10-20 03:48:11 +0000 |
commit | 82d766043c6a8dcf6283788419f110dd7ab52f80 (patch) | |
tree | 09505131008d1b4d2178065878c3e8e0d54c26a2 | |
parent | 5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff) |
A smaller, kinder, gentler regexp implementation.
-rw-r--r-- | include/regex.h | 3770 | ||||
-rw-r--r-- | include/regexp.h | 423 | ||||
-rw-r--r-- | libc/misc/regex/Makefile | 2 | ||||
-rw-r--r-- | libc/misc/regex/regex.c | 5725 | ||||
-rw-r--r-- | libc/misc/regex/rx.c | 7273 |
5 files changed, 6215 insertions, 10978 deletions
diff --git a/include/regex.h b/include/regex.h index 64a8de685..113a32e72 100644 --- a/include/regex.h +++ b/include/regex.h @@ -1,1332 +1,62 @@ -#if !defined(_RX_H) || defined(RX_WANT_SE_DEFS) -#define _RX_H +/* Definitions for data structures and routines for the regular + expression library, version 0.12. + Copyright (C) 1985,1989-1993,1995-1998,2000 Free Software Foundation, Inc. -/* Copyright (C) 1992, 1993 Free Software Foundation, Inc. + This file is part of the GNU C Library. Its master source is NOT part of + the C library, however. The master source lives in /gd/gnu/lib. -This file is part of the librx library. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. -Librx is free software; you can redistribute it and/or modify it under -the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. -Librx is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ -You should have received a copy of the GNU Library General Public -License along with this software; see the file COPYING.LIB. 
If not, -write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA -02139, USA. */ -/* t. lord Wed Sep 23 18:20:57 1992 */ +#ifndef _REGEX_H +#define _REGEX_H 1 - - - -#include <features.h> - -#define __need_size_t -#include <stddef.h> - -#include <string.h> - -#if RX_WANT_SE_DEFS != 1 -__BEGIN_DECLS -#endif - -#ifndef RX_WANT_SE_DEFS - -/* This page: Bitsets */ - -#ifndef RX_subset -typedef unsigned int RX_subset; -#define RX_subset_bits (32) -#define RX_subset_mask (RX_subset_bits - 1) -#endif - -typedef RX_subset * rx_Bitset; - -#ifdef __STDC__ -typedef void (*rx_bitset_iterator) (void *, int member_index); -#else -typedef void (*rx_bitset_iterator) (); -#endif - -#define rx_bitset_subset(N) ((N) / RX_subset_bits) -#define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)]) -#define RX_bitset_access(B,N,OP) \ - ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask]) -#define RX_bitset_member(B,N) RX_bitset_access(B, N, &) -#define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=) -#define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~) -#define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= ) -#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits) -#define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset)) - - - -/* This page: Splay trees. */ - -#ifdef __STDC__ -typedef int (*rx_sp_comparer) (void * a, void * b); -#else -typedef int (*rx_sp_comparer) (); +/* Allow the use in C++ code. */ +#ifdef __cplusplus +extern "C" { #endif -struct rx_sp_node -{ - void * key; - void * data; - struct rx_sp_node * kids[2]; -}; +/* POSIX says that <sys/types.h> must be included (by the caller) before + <regex.h>. */ -#ifdef __STDC__ -typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *); -#else -typedef void (*rx_sp_key_data_freer) (); +#if !defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE && defined VMS +/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it + should be there. 
*/ +# include <stddef.h> #endif - -/* giant inflatable hash trees */ - -struct rx_hash_item -{ - struct rx_hash_item * next_same_hash; - struct rx_hash * table; - unsigned long hash; - void * data; - void * binding; -}; - -struct rx_hash -{ - struct rx_hash * parent; - int refs; - struct rx_hash * children[13]; - struct rx_hash_item * buckets [13]; - int bucket_size [13]; -}; - -struct rx_hash_rules; - -#ifdef __STDC__ -/* should return like == */ -typedef int (*rx_hash_eq)(void *, void *); -typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *); -typedef void (*rx_free_hash)(struct rx_hash *, - struct rx_hash_rules *); -typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *, - void *); -typedef void (*rx_free_hash_item)(struct rx_hash_item *, - struct rx_hash_rules *); -#else -typedef int (*rx_hash_eq)(); -typedef struct rx_hash * (*rx_alloc_hash)(); -typedef void (*rx_free_hash)(); -typedef struct rx_hash_item * (*rx_alloc_hash_item)(); -typedef void (*rx_free_hash_item)(); -#endif - -struct rx_hash_rules -{ - rx_hash_eq eq; - rx_alloc_hash hash_alloc; - rx_free_hash free_hash; - rx_alloc_hash_item hash_item_alloc; - rx_free_hash_item free_hash_item; -}; - - -/* Forward declarations */ - -struct rx_cache; -struct rx_superset; -struct rx; -struct rx_se_list; - - - -/* - * GLOSSARY - * - * regexp - * regular expression - * expression - * pattern - a `regular' expression. The expression - * need not be formally regular -- it can contain - * constructs that don't correspond to purely regular - * expressions. - * - * buffer - * string - the string (or strings) being searched or matched. - * - * pattern buffer - a structure of type `struct re_pattern_buffer' - * This in turn contains a `struct rx', which holds the - * NFA compiled from a pattern, as well as some of the state - * of a matcher using the pattern. - * - * NFA - nondeterministic finite automata. 
Some people - * use this term to a member of the class of - * regular automata (those corresponding to a regular - * language). However, in this code, the meaning is - * more general. The automata used by Rx are comperable - * in power to what are usually called `push down automata'. - * - * Two NFA are built by rx for every pattern. One is built - * by the compiler. The other is built from the first, on - * the fly, by the matcher. The latter is called the `superstate - * NFA' because its states correspond to sets of states from - * the first NFA. (Joe Keane gets credit for the name - * `superstate NFA'). - * - * NFA edges - * epsilon edges - * side-effect edges - The NFA compiled from a pattern can have three - * kinds of edges. Epsilon edges can be taken freely anytime - * their source state is reached. Character set edges can be - * taken when their source state is reached and when the next - * character in the buffer is a member of the set. Side effect - * edges imply a transition that can only be taken after the - * indicated side effect has been successfully accomplished. - * Some examples of side effects are: - * - * Storing the current match position to record the - * location of a parentesized subexpression. - * - * Advancing the matcher over N characters if they - * match the N characters previously matched by a - * parentesized subexpression. - * - * Both of those kinds of edges occur in the NFA generated - * by the pattern: \(.\)\1 - * - * Epsilon and side effect edges are similar. Unfortunately, - * some of the code uses the name `epsilon edge' to mean - * both epsilon and side effect edges. For example, the - * function has_non_idempotent_epsilon_path computes the existance - * of a non-trivial path containing only a mix of epsilon and - * side effect edges. In that case `nonidempotent epsilon' is being - * used to mean `side effect'. 
- */ - - - - - -/* LOW LEVEL PATTERN BUFFERS */ - -/* Suppose that from some NFA state, more than one path through - * side-effect edges is possible. In what order should the paths - * be tried? A function of type rx_se_list_order answers that - * question. It compares two lists of side effects, and says - * which list comes first. - */ - -#ifdef __STDC__ -typedef int (*rx_se_list_order) (struct rx *, - struct rx_se_list *, - struct rx_se_list *); -#else -typedef int (*rx_se_list_order) (); -#endif - - - -/* Struct RX holds a compiled regular expression - that is, an nfa - * ready to be converted on demand to a more efficient superstate nfa. - * This is for the low level interface. The high-level interfaces enclose - * this in a `struct re_pattern_buffer'. - */ -struct rx -{ - /* The compiler assigns a unique id to every pattern. - * Like sequence numbers in X, there is a subtle bug here - * if you use Rx in a system that runs for a long time. - * But, because of the way the caches work out, it is almost - * impossible to trigger the Rx version of this bug. - * - * The id is used to validate superstates found in a cache - * of superstates. It isn't sufficient to let a superstate - * point back to the rx for which it was compiled -- the caller - * may be re-using a `struct rx' in which case the superstate - * is not really valid. So instead, superstates are validated - * by checking the sequence number of the pattern for which - * they were built. - */ - int rx_id; - - /* This is memory mgt. state for superstates. This may be - * shared by more than one struct rx. - */ - struct rx_cache * cache; - - /* Every regex defines the size of its own character set. - * A superstate has an array of this size, with each element - * a `struct rx_inx'. So, don't make this number too large. - * In particular, don't make it 2^16. - */ - int local_cset_size; - - /* After the NFA is built, it is copied into a contiguous region - * of memory (mostly for compatability with GNU regex). 
- * Here is that region, and it's size: - */ - void * buffer; - unsigned long allocated; - - /* Clients of RX can ask for some extra storage in the space pointed - * to by BUFFER. The field RESERVED is an input parameter to the - * compiler. After compilation, this much space will be available - * at (buffer + allocated - reserved) - */ - unsigned long reserved; - - /* --------- The remaining fields are for internal use only. --------- */ - /* --------- But! they must be initialized to 0. --------- */ - - /* NODEC is the number of nodes in the NFA with non-epsilon - * transitions. - */ - int nodec; - - /* EPSNODEC is the number of nodes with only epsilon transitions. */ - int epsnodec; - - /* The sum (NODEC + EPSNODEC) is the total number of states in the - * compiled NFA. - */ - - /* Lists of side effects as stored in the NFA are `hash consed'..meaning - * that lists with the same elements are ==. During compilation, - * this table facilitates hash-consing. - */ - struct rx_hash se_list_memo; - - /* Lists of NFA states are also hashed. - */ - struct rx_hash set_list_memo; - - - - - /* The compiler and matcher must build a number of instruction frames. - * The format of these frames is fixed (c.f. struct rx_inx). The values - * of the instructions is not fixed. - * - * An enumerated type (enum rx_opcode) defines the set of instructions - * that the compiler or matcher might generate. When filling an instruction - * frame, the INX field is found by indexing this instruction table - * with an opcode: - */ - void ** instruction_table; - - /* The list of all states in an NFA. - * During compilation, the NEXT field of NFA states links this list. - * After compilation, all the states are compacted into an array, - * ordered by state id numbers. At that time, this points to the base - * of that array. 
- */ - struct rx_nfa_state *nfa_states; - - /* Every nfa begins with one distinguished starting state: - */ - struct rx_nfa_state *start; - - /* This orders the search through super-nfa paths. - * See the comment near the typedef of rx_se_list_order. - */ - rx_se_list_order se_list_cmp; - - struct rx_superset * start_set; -}; - - - - -/* SYNTAX TREES */ - -/* Compilation is in stages. - * - * In the first stage, a pattern specified by a string is - * translated into a syntax tree. Later stages will convert - * the syntax tree into an NFA optimized for conversion to a - * superstate-NFA. - * - * This page is about syntax trees. - */ - -enum rexp_node_type -{ - r_cset, /* Match from a character set. `a' or `[a-z]'*/ - r_concat, /* Concat two subexpressions. `ab' */ - r_alternate, /* Choose one of two subexpressions. `a\|b' */ - r_opt, /* Optional subexpression. `a?' */ - r_star, /* Repeated subexpression. `a*' */ - - - /* A 2phase-star is a variation on a repeated subexpression. - * In this case, there are two subexpressions. The first, if matched, - * begins a repitition (otherwise, the whole expression is matches the - * empth string). - * - * After matching the first subexpression, a 2phase star either finishes, - * or matches the second subexpression. If the second subexpression is - * matched, then the whole construct repeats. - * - * 2phase stars are used in two circumstances. First, they - * are used as part of the implementation of POSIX intervals (counted - * repititions). Second, they are used to implement proper star - * semantics when the repeated subexpression contains paths of - * only side effects. See rx_compile for more information. - */ - r_2phase_star, - - - /* c.f. "typedef void * rx_side_effect" */ - r_side_effect, - - /* This is an extension type: It is for transient use in source->source - * transformations (implemented over syntax trees). 
- */ - r_data -}; - -/* A side effect is a matcher-specific action associated with - * transitions in the NFA. The details of side effects are up - * to the matcher. To the compiler and superstate constructors - * side effects are opaque: - */ - -typedef void * rx_side_effect; - -/* Nodes in a syntax tree are of this type: - */ -struct rexp_node -{ - enum rexp_node_type type; - union - { - rx_Bitset cset; - rx_side_effect side_effect; - struct - { - struct rexp_node *left; - struct rexp_node *right; - } pair; - void * data; - } params; -}; - - - -/* NFA - * - * A syntax tree is compiled into an NFA. This page defines the structure - * of that NFA. - */ - -struct rx_nfa_state -{ - /* These are kept in a list as the NFA is being built. */ - struct rx_nfa_state *next; - - /* After the NFA is built, states are given integer id's. - * States whose outgoing transitions are all either epsilon or - * side effect edges are given ids less than 0. Other states - * are given successive non-negative ids starting from 0. - */ - int id; - - /* The list of NFA edges that go from this state to some other. */ - struct rx_nfa_edge *edges; - - /* If you land in this state, then you implicitly land - * in all other states reachable by only epsilon translations. - * Call the set of maximal paths to such states the epsilon closure - * of this state. - * - * There may be other states that are reachable by a mixture of - * epsilon and side effect edges. Consider the set of maximal paths - * of that sort from this state. Call it the epsilon-side-effect - * closure of the state. - * - * The epsilon closure of the state is a subset of the epsilon-side- - * effect closure. It consists of all the paths that contain - * no side effects -- only epsilon edges. - * - * The paths in the epsilon-side-effect closure can be partitioned - * into equivalance sets. Two paths are equivalant if they have the - * same set of side effects, in the same order. 
The epsilon-closure - * is one of these equivalance sets. Let's call these equivalance - * sets: observably equivalant path sets. That name is chosen - * because equivalance of two paths means they cause the same side - * effects -- so they lead to the same subsequent observations other - * than that they may wind up in different target states. - * - * The superstate nfa, which is derived from this nfa, is based on - * the observation that all of the paths in an observably equivalant - * path set can be explored at the same time, provided that the - * matcher keeps track not of a single nfa state, but of a set of - * states. In particular, after following all the paths in an - * observably equivalant set, you wind up at a set of target states. - * That set of target states corresponds to one state in the - * superstate NFA. - * - * Staticly, before matching begins, it is convenient to analyze the - * nfa. Each state is labeled with a list of the observably - * equivalant path sets who's union covers all the - * epsilon-side-effect paths beginning in this state. This list is - * called the possible futures of the state. - * - * A trivial example is this NFA: - * s1 - * A ---> B - * - * s2 - * ---> C - * - * epsilon s1 - * ---------> D ------> E - * - * - * In this example, A has two possible futures. - * One invokes the side effect `s1' and contains two paths, - * one ending in state B, the other in state E. - * The other invokes the side effect `s2' and contains only - * one path, landing in state C. - */ - struct rx_possible_future *futures; - - - /* There are exactly two distinguished states in every NFA: */ - unsigned int is_final:1; - unsigned int is_start:1; - - /* These are used during NFA construction... */ - unsigned int eclosure_needed:1; - unsigned int mark:1; -}; - - -/* An edge in an NFA is typed: */ -enum rx_nfa_etype -{ - /* A cset edge is labled with a set of characters one of which - * must be matched for the edge to be taken. 
- */ - ne_cset, - - /* An epsilon edge is taken whenever its starting state is - * reached. - */ - ne_epsilon, - - /* A side effect edge is taken whenever its starting state is - * reached. Side effects may cause the match to fail or the - * position of the matcher to advance. - */ - ne_side_effect /* A special kind of epsilon. */ -}; - -struct rx_nfa_edge -{ - struct rx_nfa_edge *next; - enum rx_nfa_etype type; - struct rx_nfa_state *dest; - union - { - rx_Bitset cset; - rx_side_effect side_effect; - } params; -}; - - - -/* A possible future consists of a list of side effects - * and a set of destination states. Below are their - * representations. These structures are hash-consed which - * means that lists with the same elements share a representation - * (their addresses are ==). - */ - -struct rx_nfa_state_set -{ - struct rx_nfa_state * car; - struct rx_nfa_state_set * cdr; -}; - -struct rx_se_list -{ - rx_side_effect car; - struct rx_se_list * cdr; -}; - -struct rx_possible_future -{ - struct rx_possible_future *next; - struct rx_se_list * effects; - struct rx_nfa_state_set * destset; -}; - - - -/* This begins the description of the superstate NFA. - * - * The superstate NFA corresponds to the NFA in these ways: - * - * Every superstate NFA states SUPER correspond to sets of NFA states, - * nfa_states(SUPER). - * - * Superstate edges correspond to NFA paths. - * - * The superstate has no epsilon transitions; - * every edge has a character label, and a (possibly empty) side - * effect label. The side effect label corresponds to a list of - * side effects that occur in the NFA. These parts are referred - * to as: superedge_character(EDGE) and superedge_sides(EDGE). - * - * For a superstate edge EDGE starting in some superstate SUPER, - * the following is true (in pseudo-notation :-): - * - * exists DEST in nfa_states s.t. - * exists nfaEDGE in nfa_edges s.t. 
- * origin (nfaEDGE) == DEST - * && origin (nfaEDGE) is a member of nfa_states(SUPER) - * && exists PF in possible_futures(dest(nfaEDGE)) s.t. - * sides_of_possible_future (PF) == superedge_sides (EDGE) - * - * also: - * - * let SUPER2 := superedge_destination(EDGE) - * nfa_states(SUPER2) - * == union of all nfa state sets S s.t. - * exists PF in possible_futures(dest(nfaEDGE)) s.t. - * sides_of_possible_future (PF) == superedge_sides (EDGE) - * && S == dests_of_possible_future (PF) } - * - * Or in english, every superstate is a set of nfa states. A given - * character and a superstate implies many transitions in the NFA -- - * those that begin with an edge labeled with that character from a - * state in the set corresponding to the superstate. - * - * The destinations of those transitions each have a set of possible - * futures. A possible future is a list of side effects and a set of - * destination NFA states. Two sets of possible futures can be - * `merged' by combining all pairs of possible futures that have the - * same side effects. A pair is combined by creating a new future - * with the same side effect but the union of the two destination sets. - * In this way, all the possible futures suggested by a superstate - * and a character can be merged into a set of possible futures where - * no two elements of the set have the same set of side effects. - * - * The destination of a possible future, being a set of NFA states, - * corresponds to a supernfa state. So, the merged set of possible - * futures we just created can serve as a set of edges in the - * supernfa. - * - * The representation of the superstate nfa and the nfa is critical. - * The nfa has to be compact, but has to facilitate the rapid - * computation of missing superstates. The superstate nfa has to - * be fast to interpret, lazilly constructed, and bounded in space. - * - * To facilitate interpretation, the superstate data structures are - * peppered with `instruction frames'. 
There is an instruction set - * defined below which matchers using the supernfa must be able to - * interpret. - * - * We'd like to make it possible but not mandatory to use code - * addresses to represent instructions (c.f. gcc's computed goto). - * Therefore, we define an enumerated type of opcodes, and when - * writing one of these instructions into a data structure, use - * the opcode as an index into a table of instruction values. - * - * Here are the opcodes that occur in the superstate nfa: - */ - - -/* Every superstate contains a table of instruction frames indexed - * by characters. A normal `move' in a matcher is to fetch the next - * character and use it as an index into a superstates transition - * table. - * - * In the fasted case, only one edge follows from that character. - * In other cases there is more work to do. - * - * The descriptions of the opcodes refer to data structures that are - * described further below. - */ - -enum rx_opcode -{ - /* - * BACKTRACK_POINT is invoked when a character transition in - * a superstate leads to more than one edge. In that case, - * the edges have to be explored independently using a backtracking - * strategy. - * - * A BACKTRACK_POINT instruction is stored in a superstate's - * transition table for some character when it is known that that - * character crosses more than one edge. On encountering this - * instruction, the matcher saves enough state to backtrack to this - * point in the match later. - */ - rx_backtrack_point = 0, /* data is (struct transition_class *) */ - - /* - * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. - * There is one occurence of this instruction per rx_distinct_future. - * This instruction is skipped if a rx_distinct_future has no side effects. 
- */ - rx_do_side_effects = rx_backtrack_point + 1, - - /* data is (struct rx_distinct_future *) */ - - /* - * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose - * destination superstate has been reclaimed (or was never built). - * It recomputes the destination superstate. - * RX_CACHE_MISS is also stored in a superstate transition table before - * any of its edges have been built. - */ - rx_cache_miss = rx_do_side_effects + 1, - /* data is (struct rx_distinct_future *) */ - - /* - * RX_NEXT_CHAR is called to consume the next character and take the - * corresponding transition. This is the only instruction that uses - * the DATA field of the instruction frame instead of DATA_2. - * (see EXPLORE_FUTURE in regex.c). - */ - rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ - - /* RX_BACKTRACK indicates that a transition fails. - */ - rx_backtrack = rx_next_char + 1, /* no data */ - - /* - * RX_ERROR_INX is stored only in places that should never be executed. - */ - rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ - - rx_num_instructions = rx_error_inx + 1 -}; - -/* An id_instruction_table holds the values stored in instruction - * frames. The table is indexed by the enums declared above. - */ -extern void * rx_id_instruction_table[rx_num_instructions]; - -/* The heart of the matcher is a `word-code-interpreter' - * (like a byte-code interpreter, except that instructions - * are a full word wide). - * - * Instructions are not stored in a vector of code, instead, - * they are scattered throughout the data structures built - * by the regexp compiler and the matcher. One word-code instruction, - * together with the arguments to that instruction, constitute - * an instruction frame (struct rx_inx). - * - * This structure type is padded by hand to a power of 2 because - * in one of the dominant cases, we dispatch by indexing a table - * of instruction frames. 
If that indexing can be accomplished - * by just a shift of the index, we're happy. - * - * Instructions take at most one argument, but there are two - * slots in an instruction frame that might hold that argument. - * These are called data and data_2. The data slot is only - * used for one instruction (RX_NEXT_CHAR). For all other - * instructions, data should be set to 0. - * - * RX_NEXT_CHAR is the most important instruction by far. - * By reserving the data field for its exclusive use, - * instruction dispatch is sped up in that case. There is - * no need to fetch both the instruction and the data, - * only the data is needed. In other words, a `cycle' begins - * by fetching the field data. If that is non-0, then it must - * be the destination state of a next_char transition, so - * make that value the current state, advance the match position - * by one character, and start a new cycle. On the other hand, - * if data is 0, fetch the instruction and do a more complicated - * dispatch on that. - */ - -struct rx_inx -{ - void * data; - void * data_2; - void * inx; - void * fnord; -}; - -#ifndef RX_TAIL_ARRAY -#define RX_TAIL_ARRAY 1 -#endif - -/* A superstate corresponds to a set of nfa states. Those sets are - * represented by STRUCT RX_SUPERSET. The constructors - * guarantee that only one (shared) structure is created for a given set. - */ -struct rx_superset -{ - int refs; /* This is a reference counted structure. */ - - /* We keep these sets in a cache because (in an unpredictable way), - * the same set is often created again and again. But that is also - * problematic -- compatibility with POSIX and GNU regex requires - * that we not be able to tell when a program discards a particular - * NFA (thus invalidating the supersets created from it). - * - * But when a cache hit appears to occur, we will have in hand the - * nfa for which it may have happened. That is why every nfa is given - * its own sequence number. 
On a cache hit, the cache is validated - * by comparing the nfa sequence number to this field: - */ - int id; - - struct rx_nfa_state * car; /* May or may not be a valid addr. */ - struct rx_superset * cdr; - - /* If the corresponding superstate exists: */ - struct rx_superstate * superstate; - - - /* There is another bookkeeping problem. It is expensive to - * compute the starting nfa state set for an nfa. So, once computed, - * it is cached in the `struct rx'. - * - * But, the state set can be flushed from the superstate cache. - * When that happens, we can't know if the corresponding `struct rx' - * is still alive or if it has been freed or re-used by the program. - * So, the cached pointer to this set in a struct rx might be invalid - * and we need a way to validate it. - * - * Fortunately, even if this set is flushed from the cache, it is - * not freed. It just goes on the free-list of supersets. - * So we can still examine it. - * - * So to validate a starting set memo, check to see if the - * starts_for field still points back to the struct rx in question, - * and if the ID matches the rx sequence number. - */ - struct rx * starts_for; - - /* This is used to link into a hash bucket so these objects can - * be `hash-consed'. - */ - struct rx_hash_item hash_item; -}; - -#define rx_protect_superset(RX,CON) (++(CON)->refs) - -/* The terminology may be confusing (rename this structure?). - * Every character occurs in at most one rx_super_edge per super-state. - * But, that structure might have more than one option, indicating a point - * of non-determinism. - * - * In other words, this structure holds a list of superstate edges - * sharing a common starting state and character label. The edges - * are in the field OPTIONS. All superstate edges sharing the same - * starting state and character are in this list. 
- */ -struct rx_super_edge -{ - struct rx_super_edge *next; - struct rx_inx rx_backtrack_frame; - int cset_size; - rx_Bitset cset; - struct rx_distinct_future *options; -}; - -/* A superstate is a set of nfa states (RX_SUPERSET) along - * with a transition table. Superstates are built on demand and reclaimed - * without warning. To protect a superstate from this ghastly fate, - * use LOCK_SUPERSTATE. - */ -struct rx_superstate -{ - int rx_id; /* c.f. the id field of rx_superset */ - int locks; /* protection from reclamation */ - - /* Within a superstate cache, all the superstates are kept in a big - * queue. The tail of the queue is the state most likely to be - * reclaimed. The *recyclable fields hold the queue position of - * this state. - */ - struct rx_superstate * next_recyclable; - struct rx_superstate * prev_recyclable; - - /* The supernfa edges that exist in the cache and that have - * this state as their destination are kept in this list: - */ - struct rx_distinct_future * transition_refs; - - /* The list of nfa states corresponding to this superstate: */ - struct rx_superset * contents; - - /* The list of edges in the cache beginning from this state. */ - struct rx_super_edge * edges; - - /* A tail of the recyclable queue is marked as semifree. A semifree - * state has no incoming next_char transitions -- any transition - * into a semifree state causes a complex dispatch with the side - * effect of rescuing the state from its semifree state. - * - * An alternative to this might be to make next_char more expensive, - * and to move a state to the head of the recyclable queue whenever - * it is entered. That way, popular states would never be recycled. - * - * But unilaterally making next_char more expensive actually loses. - * So, incoming transitions are only made expensive for states near - * the tail of the recyclable queue. 
The more cache contention - * there is, the more frequently a state will have to prove itself - * and be moved back to the front of the queue. If there is less - * contention, then popular states just aggregate in the front of - * the queue and stay there. - */ - int is_semifree; - - - /* This keeps track of the size of the transition table for this - * state. There is a half-hearted attempt to support variable sized - * superstates. - */ - int trans_size; - - /* Indexed by characters... */ - struct rx_inx transitions[RX_TAIL_ARRAY]; -}; - - -/* A list of distinct futures define the edges that leave from a - * given superstate on a given character. c.f. rx_super_edge. - */ - -struct rx_distinct_future -{ - struct rx_distinct_future * next_same_super_edge[2]; - struct rx_distinct_future * next_same_dest; - struct rx_distinct_future * prev_same_dest; - struct rx_superstate * present; /* source state */ - struct rx_superstate * future; /* destination state */ - struct rx_super_edge * edge; - - - /* The future_frame holds the instruction that should be executed - * after all the side effects are done, when it is time to complete - * the transition to the next state. - * - * Normally this is a next_char instruction, but it may be a - * cache_miss instruction as well, depending on whether or not - * the superstate is in the cache and semifree. - * - * If this is the only future for a given superstate/char, and - * if there are no side effects to be performed, this frame is - * not used (directly) at all. Instead, its contents are copied - * into the transition table of the starting state of this dist. future. 
- */ - struct rx_inx future_frame; - - struct rx_inx side_effects_frame; - struct rx_se_list * effects; -}; - -#define rx_lock_superstate(R,S) ((S)->locks++) -#define rx_unlock_superstate(R,S) (--(S)->locks) - - -/* This page destined for rx.h */ - -struct rx_blocklist -{ - struct rx_blocklist * next; - int bytes; -}; - -struct rx_freelist -{ - struct rx_freelist * next; -}; - -struct rx_cache; - -#ifdef __STDC__ -typedef void (*rx_morecore_fn)(struct rx_cache *); -#else -typedef void (*rx_morecore_fn)(); -#endif - -/* You use this to control the allocation of superstate data - * during matching. Most of it should be initialized to 0. - * - * A MORECORE function is necessary. It should allocate - * a new block of memory or return 0. - * A default that uses malloc is called `rx_morecore'. - * - * The number of SUPERSTATES_ALLOWED indirectly limits how much memory - * the system will try to allocate. The default is 128. Batch style - * applications that are very regexp intensive should use as high a number - * as possible without thrashing. - * - * The LOCAL_CSET_SIZE is the number of characters in a character set. - * It is therefore the number of entries in a superstate transition table. - * Generally, it should be 256. If your character set has 16 bits, - * it is better to translate your regexps into equivalent 8 bit patterns. - */ - -struct rx_cache -{ - struct rx_hash_rules superset_hash_rules; - - /* Objects are allocated by incrementing a pointer that - * scans across rx_blocklists. - */ - struct rx_blocklist * memory; - struct rx_blocklist * memory_pos; - int bytes_left; - char * memory_addr; - rx_morecore_fn morecore; - - /* Freelists. */ - struct rx_freelist * free_superstates; - struct rx_freelist * free_transition_classes; - struct rx_freelist * free_discernable_futures; - struct rx_freelist * free_supersets; - struct rx_freelist * free_hash; - - /* Two sets of superstates -- those that are semifreed, and those - * that are being used. 
- */ - struct rx_superstate * lru_superstate; - struct rx_superstate * semifree_superstate; - - struct rx_superset * empty_superset; - - int superstates; - int semifree_superstates; - int hits; - int misses; - int superstates_allowed; - - int local_cset_size; - void ** instruction_table; - - struct rx_hash superset_table; -}; - - - -/* The lowest-level search function supports arbitrarily fragmented - * strings and (optionally) suspendable/resumable searches. - * - * Callers have to provide a few hooks. - */ - -#ifndef __GNUC__ -#ifdef __STDC__ -#define __const__ const -#else -#define __const__ -#endif -#endif - -/* This holds a matcher position */ -struct rx_string_position -{ - __const__ unsigned char * pos; /* The current pos. */ - __const__ unsigned char * string; /* The current string burst. */ - __const__ unsigned char * end; /* First invalid position >= POS. */ - int offset; /* Integer address of the current burst. */ - int size; /* Current string's size. */ - int search_direction; /* 1 or -1 */ - int search_end; /* First position to not try. */ -}; - - -enum rx_get_burst_return -{ - rx_get_burst_continuation, - rx_get_burst_error, - rx_get_burst_ok, - rx_get_burst_no_more -}; - - -/* A call to get burst should make POS valid. It might be invalid - * if the STRING field doesn't point to a burst that actually - * contains POS. - * - * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to - * whether or not to pad to the left. Padding to the right is always - * appropriate, but need not go past the point indicated by STOP. - * - * If a continuation is returned, then the reentering call to - * a search function will retry the get_burst. 
- */ - -#ifdef __STDC__ -typedef enum rx_get_burst_return - (*rx_get_burst_fn) (struct rx_string_position * pos, - void * app_closure, - int stop); - -#else -typedef enum rx_get_burst_return (*rx_get_burst_fn) (); -#endif - - -enum rx_back_check_return -{ - rx_back_check_continuation, - rx_back_check_error, - rx_back_check_pass, - rx_back_check_fail -}; - -/* Back_check should advance the position it is passed - * over rparen - lparen characters and return pass iff - * the characters starting at POS match those indexed - * by [LPAREN..RPAREN]. - * - * If a continuation is returned, then the reentering call to - * a search function will retry the back_check. - */ - -#ifdef __STDC__ -typedef enum rx_back_check_return - (*rx_back_check_fn) (struct rx_string_position * pos, - int lparen, - int rparen, - unsigned char * translate, - void * app_closure, - int stop); - -#else -typedef enum rx_back_check_return (*rx_back_check_fn) (); -#endif - - - - -/* A call to fetch_char should return the character at POS or POS + 1. - * Returning continuations here isn't supported. OFFSET is either 0 or 1 - * and indicates which characters is desired. - */ - -#ifdef __STDC__ -typedef int (*rx_fetch_char_fn) (struct rx_string_position * pos, - int offset, - void * app_closure, - int stop); -#else -typedef int (*rx_fetch_char_fn) (); -#endif - - -enum rx_search_return -{ - rx_search_continuation = -4, - rx_search_error = -3, - rx_search_soft_fail = -2, /* failed by running out of string */ - rx_search_fail = -1 /* failed only by reaching failure states */ - /* return values >= 0 indicate the position of a successful match */ -}; - - - - - - -/* regex.h - * - * The remaining declarations replace regex.h. - */ - -/* This is an array of error messages corresponding to the error codes. - */ -extern __const__ char *re_error_msg[]; - -/* If any error codes are removed, changed, or added, update the - `re_error_msg' table in regex.c. */ -typedef enum -{ - REG_NOERROR = 0, /* Success. 
*/ - REG_NOMATCH, /* Didn't find a match (for regexec). */ - - /* POSIX regcomp return error codes. (In the order listed in the - standard.) */ - REG_BADPAT, /* Invalid pattern. */ - REG_ECOLLATE, /* Not implemented. */ - REG_ECTYPE, /* Invalid character class name. */ - REG_EESCAPE, /* Trailing backslash. */ - REG_ESUBREG, /* Invalid back reference. */ - REG_EBRACK, /* Unmatched left bracket. */ - REG_EPAREN, /* Parenthesis imbalance. */ - REG_EBRACE, /* Unmatched \{. */ - REG_BADBR, /* Invalid contents of \{\}. */ - REG_ERANGE, /* Invalid range end. */ - REG_ESPACE, /* Ran out of memory. */ - REG_BADRPT, /* No preceding re for repetition op. */ - - /* Error codes we've added. */ - REG_EEND, /* Premature end. */ - REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ - REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ -} reg_errcode_t; - -/* The regex.c support, as a client of rx, defines a set of possible - * side effects that can be added to the edge lables of nfa edges. - * Here is the list of sidef effects in use. - */ - -enum re_side_effects -{ -#define RX_WANT_SE_DEFS 1 -#undef RX_DEF_SE -#undef RX_DEF_CPLX_SE -#define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE, -#define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE, -#include <regex.h> -#undef RX_DEF_SE -#undef RX_DEF_CPLX_SE -#undef RX_WANT_SE_DEFS - re_floogle_flap = 65533 -}; - -/* These hold paramaters for the kinds of side effects that are possible - * in the supported pattern languages. These include things like the - * numeric bounds of {} operators and the index of paren registers for - * subexpression measurement or backreferencing. - */ -struct re_se_params -{ - enum re_side_effects se; - int op1; - int op2; -}; +/* The following two types have to be signed and unsigned integer type + wide enough to hold a value of a pointer. For most ANSI compilers + ptrdiff_t and size_t should be likely OK. Still size of these two + types is 2 for Microsoft C. Ugh... 
*/ +typedef long int s_reg_t; +typedef unsigned long int active_reg_t; -typedef unsigned reg_syntax_t; - -struct re_pattern_buffer -{ - struct rx rx; - reg_syntax_t syntax; /* See below for syntax bit definitions. */ - - unsigned int no_sub:1; /* If set, don't return register offsets. */ - unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */ - unsigned int not_eol:1; /* match at the ends of the string. */ - unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/ - unsigned int least_subs:1; /* If set, and returning registers, return - * as few values as possible. Only - * backreferenced groups and group 0 (the whole - * match) will be returned. - */ - - /* If true, this says that the matcher should keep registers on its - * backtracking stack. For many patterns, we can easily determine that - * this isn't necessary. - */ - unsigned int match_regs_on_stack:1; - unsigned int search_regs_on_stack:1; - - /* is_anchored and begbuf_only are filled in by rx_compile. */ - unsigned int is_anchored:1; /* Anchorded by ^? */ - unsigned int begbuf_only:1; /* Anchored to char position 0? */ - - - /* If REGS_UNALLOCATED, allocate space in the `regs' structure - * for `max (RE_NREGS, re_nsub + 1)' groups. - * If REGS_REALLOCATE, reallocate space if necessary. - * If REGS_FIXED, use what's there. - */ -#define REGS_UNALLOCATED 0 -#define REGS_REALLOCATE 1 -#define REGS_FIXED 2 - unsigned int regs_allocated:2; - - - /* Either a translate table to apply to all characters before - * comparing them, or zero for no translation. The translation - * is applied to a pattern when it is compiled and to a string - * when it is matched. - */ - unsigned char * translate; - - /* If this is a valid pointer, it tells rx not to store the extents of - * certain subexpressions (those corresponding to non-zero entries). - * Passing 0x1 is the same as passing an array of all ones. Passing 0x0 - * is the same as passing an array of all zeros. 
- * The array should contain as many entries as their are subexps in the - * regexp. - * - * For POSIX compatability, when using regcomp and regexec this field - * is zeroed and ignored. - */ - char * syntax_parens; - - /* Number of subexpressions found by the compiler. */ - size_t re_nsub; - - void * buffer; /* Malloced memory for the nfa. */ - unsigned long allocated; /* Size of that memory. */ - - /* Pointer to a fastmap, if any, otherwise zero. re_search uses - * the fastmap, if there is one, to skip over impossible - * starting points for matches. */ - char *fastmap; - - unsigned int fastmap_accurate:1; /* These three are internal. */ - unsigned int can_match_empty:1; - struct rx_nfa_state * start; /* The nfa starting state. */ - - /* This is the list of iterator bounds for {lo,hi} constructs. - * The memory pointed to is part of the rx->buffer. - */ - struct re_se_params *se_params; - - /* This is a bitset representation of the fastmap. - * This is a true fastmap that already takes the translate - * table into account. - */ - rx_Bitset fastset; -}; - -/* Type for byte offsets within the string. POSIX mandates this. */ -typedef int regoff_t; - -/* This is the structure we store register match data in. See - regex.texinfo for a full description of what registers match. */ -struct re_registers -{ - unsigned num_regs; - regoff_t *start; - regoff_t *end; -}; - -typedef struct re_pattern_buffer regex_t; - -/* POSIX specification for registers. Aside from the different names than - `re_registers', POSIX uses an array of structures, instead of a - structure of arrays. */ -typedef struct -{ - regoff_t rm_so; /* Byte offset from string's start to substring's start. */ - regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ -} regmatch_t; - - /* The following bits are used to determine the regexp syntax we recognize. The set/not-set meanings are chosen so that Emacs syntax remains the value 0. 
The bits are given in alphabetical order, and the definitions shifted by one from the previous bit; thus, when we add or remove a bit, only one other definition need change. */ +typedef unsigned long int reg_syntax_t; /* If this bit is not set, then \ inside a bracket expression is literal. If set, then such a \ quotes the following character. */ -#define RE_BACKSLASH_ESCAPE_IN_LISTS (1) +#define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1) /* If this bit is not set, then + and ? are operators, and \+ and \? are - literals. + literals. If set, then \+ and \? are operators and + and ? are literals. */ #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) @@ -1342,7 +72,7 @@ typedef struct ^ is an anchor if it is at the beginning of a regular expression or after an open-group or an alternation operator; $ is an anchor if it is at the end of a regular expression, or - before a close-group or an alternation operator. + before a close-group or an alternation operator. This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because POSIX draft 11.2 says that * etc. in leading positions is undefined. @@ -1353,7 +83,7 @@ typedef struct /* If this bit is set, then special characters are always special regardless of where they are in the pattern. If this bit is not set, then special characters are special only in - some contexts; otherwise they are ordinary. Specifically, + some contexts; otherwise they are ordinary. Specifically, * + ? and intervals are only special when not after the beginning, open-group, or alternation operator. */ #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) @@ -1375,7 +105,7 @@ typedef struct #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) /* If this bit is set, either \{...\} or {...} defines an - interval, depending on RE_NO_BK_BRACES. + interval, depending on RE_NO_BK_BRACES. If not set, \{, \}, {, and } are literals. 
*/ #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) @@ -1400,7 +130,7 @@ typedef struct If not set, then \<digit> is a back-reference. */ #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) -/* If this bit is set, then | is an alternation operator, and \| is literal. +/* If this bit is set, then | is an alternation operator, and \| is literal. If not set, then \| is an alternation operator, and | is literal. */ #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) @@ -1414,9 +144,21 @@ typedef struct If not set, then an unmatched ) is invalid. */ #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) +/* If this bit is set, succeed as soon as we match the whole pattern, + without further backtracking. */ +#define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1) + /* If this bit is set, do not process the GNU regex operators. - IF not set, then the GNU regex operators are recognized. */ -#define RE_NO_GNU_OPS (RE_UNMATCHED_RIGHT_PAREN_ORD << 1) + If not set, then the GNU regex operators are recognized. */ +#define RE_NO_GNU_OPS (RE_NO_POSIX_BACKTRACKING << 1) + +/* If this bit is set, turn on internal regex debugging. + If not set, and debugging was on, turn it off. + This only works if regex.c is compiled -DDEBUG. + We define this bit always, so that all that's needed to turn on + debugging is to recompile regex.c; the calling code can always have + this bit set, and it won't affect anything in the normal case. */ +#define RE_DEBUG (RE_NO_GNU_OPS << 1) /* This global variable defines the particular regexp syntax to use (for some interfaces). When a regexp is compiled, the syntax used is @@ -1426,23 +168,24 @@ extern reg_syntax_t re_syntax_options; /* Define combinations of the above bits for the standard possibilities. (The [[[ comments delimit what gets put into the Texinfo file, so - don't delete them!) */ + don't delete them!) 
*/ /* [[[begin syntaxes]]] */ #define RE_SYNTAX_EMACS 0 #define RE_SYNTAX_AWK \ - (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ - | RE_NO_BK_PARENS | RE_NO_BK_REFS \ - | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ - | RE_DOT_NEWLINE \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_DOT_NEWLINE | RE_CONTEXT_INDEP_ANCHORS \ | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS) -#define RE_SYNTAX_GNU_AWK \ - ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) \ - & ~(RE_DOT_NOT_NULL|RE_INTERVALS)) +#define RE_SYNTAX_GNU_AWK \ + ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG) \ + & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS)) #define RE_SYNTAX_POSIX_AWK \ - (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_NO_GNU_OPS) + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \ + | RE_INTERVALS | RE_NO_GNU_OPS) #define RE_SYNTAX_GREP \ (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ @@ -1478,10 +221,10 @@ extern reg_syntax_t re_syntax_options; (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) #define RE_SYNTAX_POSIX_EXTENDED \ - (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ - | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ - | RE_UNMATCHED_RIGHT_PAREN_ORD) + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD) /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ @@ -1491,15 +234,15 @@ extern reg_syntax_t re_syntax_options; | RE_NO_BK_PARENS | RE_NO_BK_REFS \ | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) /* [[[end syntaxes]]] */ - + /* Maximum number of duplicates an interval can allow. Some systems (erroneously) define this in other header files, but we want our value, so remove any previous define. 
*/ #ifdef RE_DUP_MAX -#undef RE_DUP_MAX +# undef RE_DUP_MAX #endif -/* if sizeof(int) == 2, then ((1 << 15) - 1) overflows */ -#define RE_DUP_MAX (0x7fff) +/* If sizeof(int) == 2, then ((1 << 15) - 1) overflows. */ +#define RE_DUP_MAX (0x7fff) /* POSIX `cflags' bits (i.e., information for `regcomp'). */ @@ -1511,7 +254,7 @@ extern reg_syntax_t re_syntax_options; /* If this bit is set, then ignore case when matching. If not set, then case is significant. */ #define REG_ICASE (REG_EXTENDED << 1) - + /* If this bit is set, then anchors do not match at newline characters in the string. If not set, then anchors do match at newlines. */ @@ -1534,2221 +277,266 @@ extern reg_syntax_t re_syntax_options; /* Like REG_NOTBOL, except for the end-of-line. */ #define REG_NOTEOL (1 << 1) -/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, - * `re_match_2' returns information about at least this many registers - * the first time a `regs' structure is passed. - * - * Also, this is the greatest number of backreferenced subexpressions - * allowed in a pattern being matched without caller-supplied registers. - */ -#ifndef RE_NREGS -#define RE_NREGS 30 + +/* If any error codes are removed, changed, or added, update the + `re_error_msg' table in regex.c. */ +typedef enum +{ +#ifdef _XOPEN_SOURCE + REG_ENOSYS = -1, /* This will never happen for this implementation. */ #endif -extern int rx_cache_bound; -extern char rx_version_string[]; + REG_NOERROR = 0, /* Success. */ + REG_NOMATCH, /* Didn't find a match (for regexec). */ + /* POSIX regcomp return error codes. (In the order listed in the + standard.) */ + REG_BADPAT, /* Invalid pattern. */ + REG_ECOLLATE, /* Not implemented. */ + REG_ECTYPE, /* Invalid character class name. */ + REG_EESCAPE, /* Trailing backslash. */ + REG_ESUBREG, /* Invalid back reference. */ + REG_EBRACK, /* Unmatched left bracket. */ + REG_EPAREN, /* Parenthesis imbalance. */ + REG_EBRACE, /* Unmatched \{. */ + REG_BADBR, /* Invalid contents of \{\}. 
*/ + REG_ERANGE, /* Invalid range end. */ + REG_ESPACE, /* Ran out of memory. */ + REG_BADRPT, /* No preceding re for repetition op. */ + /* Error codes we've added. */ + REG_EEND, /* Premature end. */ + REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ + REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ +} reg_errcode_t; -#ifdef RX_WANT_RX_DEFS +/* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields `buffer', `allocated', `fastmap', + `translate', and `no_sub' can be set. After the pattern has been + compiled, the `re_nsub' field is available. All other fields are + private to the regex routines. */ -/* This is decls to the interesting subsystems and lower layers - * of rx. Everything which doesn't have a public counterpart in - * regex.c is declared here. - */ +#ifndef RE_TRANSLATE_TYPE +# define RE_TRANSLATE_TYPE char * +#endif +struct re_pattern_buffer +{ +/* [[[begin pattern_buffer]]] */ + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; -#ifdef __STDC__ -typedef void (*rx_hash_freefn) (struct rx_hash_item * it); -#else /* ndef __STDC__ */ -typedef void (*rx_hash_freefn) (); -#endif /* ndef __STDC__ */ + /* Number of bytes to which `buffer' points. */ + unsigned long int allocated; + /* Number of bytes actually used in `buffer'. */ + unsigned long int used; - + /* Syntax setting with which the pattern was compiled. 
*/ + reg_syntax_t syntax; -#ifdef __STDC__ -//RX_DECL int rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b); -RX_DECL int rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b); -//RX_DECL int rx_bitset_empty (int size, rx_Bitset set); -RX_DECL void rx_bitset_null (int size, rx_Bitset b); -RX_DECL void rx_bitset_universe (int size, rx_Bitset b); -RX_DECL void rx_bitset_complement (int size, rx_Bitset b); -RX_DECL void rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b); -RX_DECL void rx_bitset_union (int size, rx_Bitset a, rx_Bitset b); -RX_DECL void rx_bitset_intersection (int size, - rx_Bitset a, rx_Bitset b); -RX_DECL void rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b); -//RX_DECL void rx_bitset_revdifference (int size, -// rx_Bitset a, rx_Bitset b); -//RX_DECL void rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b); -RX_DECL unsigned long rx_bitset_hash (int size, rx_Bitset b); -RX_DECL struct rx_hash_item * rx_hash_find (struct rx_hash * table, - unsigned long hash, - void * value, - struct rx_hash_rules * rules); -RX_DECL struct rx_hash_item * rx_hash_store (struct rx_hash * table, - unsigned long hash, - void * value, - struct rx_hash_rules * rules); -RX_DECL void rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules); -RX_DECL void rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn, - struct rx_hash_rules * rules); -RX_DECL rx_Bitset rx_cset (struct rx *rx); -RX_DECL rx_Bitset rx_copy_cset (struct rx *rx, rx_Bitset a); -RX_DECL void rx_free_cset (struct rx * rx, rx_Bitset c); -RX_DECL struct rexp_node * rexp_node (struct rx *rx, - enum rexp_node_type type); -RX_DECL struct rexp_node * rx_mk_r_cset (struct rx * rx, - rx_Bitset b); -RX_DECL struct rexp_node * rx_mk_r_concat (struct rx * rx, - struct rexp_node * a, - struct rexp_node * b); -RX_DECL struct rexp_node * rx_mk_r_alternate (struct rx * rx, - struct rexp_node * a, - struct rexp_node * b); -RX_DECL struct rexp_node * rx_mk_r_opt (struct rx * rx, - 
struct rexp_node * a); -RX_DECL struct rexp_node * rx_mk_r_star (struct rx * rx, - struct rexp_node * a); -RX_DECL struct rexp_node * rx_mk_r_2phase_star (struct rx * rx, - struct rexp_node * a, - struct rexp_node * b); -RX_DECL struct rexp_node * rx_mk_r_side_effect (struct rx * rx, - rx_side_effect a); -//RX_DECL struct rexp_node * rx_mk_r_data (struct rx * rx, -// void * a); -RX_DECL void rx_free_rexp (struct rx * rx, struct rexp_node * node); -RX_DECL struct rexp_node * rx_copy_rexp (struct rx *rx, - struct rexp_node *node); -RX_DECL struct rx_nfa_state * rx_nfa_state (struct rx *rx); -RX_DECL void rx_free_nfa_state (struct rx_nfa_state * n); -RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (struct rx * rx, - int id); -RX_DECL struct rx_nfa_edge * rx_nfa_edge (struct rx *rx, - enum rx_nfa_etype type, - struct rx_nfa_state *start, - struct rx_nfa_state *dest); -RX_DECL void rx_free_nfa_edge (struct rx_nfa_edge * e); -RX_DECL void rx_free_nfa (struct rx *rx); -RX_DECL int rx_build_nfa (struct rx *rx, - struct rexp_node *rexp, - struct rx_nfa_state **start, - struct rx_nfa_state **end); -RX_DECL void rx_name_nfa_states (struct rx *rx); -RX_DECL int rx_eclose_nfa (struct rx *rx); -RX_DECL void rx_delete_epsilon_transitions (struct rx *rx); -RX_DECL int rx_compactify_nfa (struct rx *rx, - void **mem, unsigned long *size); -RX_DECL void rx_release_superset (struct rx *rx, - struct rx_superset *set); -RX_DECL struct rx_superset * rx_superset_cons (struct rx * rx, - struct rx_nfa_state *car, struct rx_superset *cdr); -RX_DECL struct rx_superset * rx_superstate_eclosure_union - (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl); -RX_DECL struct rx_superstate * rx_superstate (struct rx *rx, - struct rx_superset *set); -RX_DECL struct rx_inx * rx_handle_cache_miss - (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data); -RX_DECL reg_errcode_t rx_compile (__const__ char *pattern, int size, - reg_syntax_t syntax, - struct 
re_pattern_buffer * rxb); -RX_DECL void rx_blow_up_fastmap (struct re_pattern_buffer * rxb); -#else /* STDC */ -RX_DECL int rx_bitset_is_equal (); -RX_DECL int rx_bitset_is_subset (); -RX_DECL int rx_bitset_empty (); -RX_DECL void rx_bitset_null (); -RX_DECL void rx_bitset_universe (); -RX_DECL void rx_bitset_complement (); -RX_DECL void rx_bitset_assign (); -RX_DECL void rx_bitset_union (); -RX_DECL void rx_bitset_intersection (); -RX_DECL void rx_bitset_difference (); -RX_DECL void rx_bitset_revdifference (); -RX_DECL void rx_bitset_xor (); -RX_DECL unsigned long rx_bitset_hash (); -RX_DECL struct rx_hash_item * rx_hash_find (); -RX_DECL struct rx_hash_item * rx_hash_store (); -RX_DECL void rx_hash_free (); -RX_DECL void rx_free_hash_table (); -RX_DECL rx_Bitset rx_cset (); -RX_DECL rx_Bitset rx_copy_cset (); -RX_DECL void rx_free_cset (); -RX_DECL struct rexp_node * rexp_node (); -RX_DECL struct rexp_node * rx_mk_r_cset (); -RX_DECL struct rexp_node * rx_mk_r_concat (); -RX_DECL struct rexp_node * rx_mk_r_alternate (); -RX_DECL struct rexp_node * rx_mk_r_opt (); -RX_DECL struct rexp_node * rx_mk_r_star (); -RX_DECL struct rexp_node * rx_mk_r_2phase_star (); -RX_DECL struct rexp_node * rx_mk_r_side_effect (); -RX_DECL struct rexp_node * rx_mk_r_data (); -RX_DECL void rx_free_rexp (); -RX_DECL struct rexp_node * rx_copy_rexp (); -RX_DECL struct rx_nfa_state * rx_nfa_state (); -RX_DECL void rx_free_nfa_state (); -RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (); -RX_DECL struct rx_nfa_edge * rx_nfa_edge (); -RX_DECL void rx_free_nfa_edge (); -RX_DECL void rx_free_nfa (); -RX_DECL int rx_build_nfa (); -RX_DECL void rx_name_nfa_states (); -RX_DECL int rx_eclose_nfa (); -RX_DECL void rx_delete_epsilon_transitions (); -RX_DECL int rx_compactify_nfa (); -RX_DECL void rx_release_superset (); -RX_DECL struct rx_superset * rx_superset_cons (); -RX_DECL struct rx_superset * rx_superstate_eclosure_union (); -RX_DECL struct rx_superstate * rx_superstate (); -RX_DECL struct 
rx_inx * rx_handle_cache_miss (); -RX_DECL reg_errcode_t rx_compile (); -RX_DECL void rx_blow_up_fastmap (); -#endif /* STDC */ - - -#endif /* RX_WANT_RX_DEFS */ + /* Pointer to a fastmap, if any, otherwise zero. re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + is applied to a pattern when it is compiled and to a string + when it is matched. */ + RE_TRANSLATE_TYPE translate; - -#ifdef __STDC__ -extern int re_search_2 (struct re_pattern_buffer *rxb, - __const__ char * string1, int size1, - __const__ char * string2, int size2, - int startpos, int range, - struct re_registers *regs, - int stop); -extern int re_search (struct re_pattern_buffer * rxb, __const__ char *string, - int size, int startpos, int range, - struct re_registers *regs); -extern int re_match_2 (struct re_pattern_buffer * rxb, - __const__ char * string1, int size1, - __const__ char * string2, int size2, - int pos, struct re_registers *regs, int stop); -extern int re_match (struct re_pattern_buffer * rxb, - __const__ char * string, - int size, int pos, - struct re_registers *regs); -extern reg_syntax_t re_set_syntax (reg_syntax_t syntax); -extern void re_set_registers (struct re_pattern_buffer *bufp, - struct re_registers *regs, - unsigned num_regs, - regoff_t * starts, regoff_t * ends); -extern __const__ char * re_compile_pattern (__const__ char *pattern, - int length, - struct re_pattern_buffer * rxb); -extern int re_compile_fastmap (struct re_pattern_buffer * rxb); -extern char * re_comp (__const__ char *s); -extern int re_exec (__const__ char *s); -extern int regcomp (regex_t * preg, __const__ char * pattern, int cflags); -extern int regexec (__const__ regex_t *preg, __const__ char *string, - size_t nmatch, regmatch_t pmatch[], - int eflags); -extern size_t regerror (int errcode, __const__ regex_t 
*preg, - char *errbuf, size_t errbuf_size); -extern void regfree (regex_t *preg); - -#else /* STDC */ -extern int re_search_2 (); -extern int re_search (); -extern int re_match_2 (); -extern int re_match (); -extern reg_syntax_t re_set_syntax (); -extern void re_set_registers (); -extern __const__ char * re_compile_pattern (); -extern int re_compile_fastmap (); -extern char * re_comp (); -extern int re_exec (); -extern int regcomp (); -extern int regexec (); -extern size_t regerror (); -extern void regfree (); - -#endif /* STDC */ + /* Number of subexpressions found by the compiler. */ + size_t re_nsub; - + /* Zero if this pattern cannot match the empty string, one else. + Well, in truth it's used only in `re_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see `re_compile_fastmap' (the + `duplicate' case). */ + unsigned can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ +#define REGS_UNALLOCATED 0 +#define REGS_REALLOCATE 1 +#define REGS_FIXED 2 + unsigned regs_allocated : 2; -#ifdef RX_WANT_RX_DEFS + /* Set to zero when `regex_compile' compiles a pattern; set to one + by `re_compile_fastmap' if it updates the fastmap. */ + unsigned fastmap_accurate : 1; -struct rx_counter_frame -{ - int tag; - int val; - struct rx_counter_frame * inherited_from; /* If this is a copy. */ - struct rx_counter_frame * cdr; -}; + /* If set, `re_match_2' does not return information about + subexpressions. */ + unsigned no_sub : 1; -struct rx_backtrack_frame -{ - char * counter_stack_sp; - - /* A frame is used to save the matchers state when it crosses a - * backtracking point. The `stk_' fields correspond to variables - * in re_search_2 (just strip off thes `stk_'). They are documented - * tere. 
- */ - struct rx_superstate * stk_super; - unsigned int stk_c; - struct rx_string_position stk_test_pos; - int stk_last_l; - int stk_last_r; - int stk_test_ret; - - /* This is the list of options left to explore at the backtrack - * point for which this frame was created. - */ - struct rx_distinct_future * df; - struct rx_distinct_future * first_df; - -#ifdef RX_DEBUG - int stk_line_no; -#endif -}; + /* If set, a beginning-of-line anchor doesn't match at the + beginning of the string. */ + unsigned not_bol : 1; -struct rx_stack_chunk -{ - struct rx_stack_chunk * next_chunk; - int bytes_left; - char * sp; -}; + /* Similarly for an end-of-line anchor. */ + unsigned not_eol : 1; -enum rx_outer_entry -{ - rx_outer_start, - rx_outer_fastmap, - rx_outer_test, - rx_outer_restore_pos -}; + /* If true, an anchor at a newline matches. */ + unsigned newline_anchor : 1; -enum rx_fastmap_return -{ - rx_fastmap_continuation, - rx_fastmap_error, - rx_fastmap_ok, - rx_fastmap_fail +/* [[[end pattern_buffer]]] */ }; -enum rx_fastmap_entry -{ - rx_fastmap_start, - rx_fastmap_string_break -}; +typedef struct re_pattern_buffer regex_t; + +/* Type for byte offsets within the string. POSIX mandates this. */ +typedef int regoff_t; -enum rx_test_return -{ - rx_test_continuation, - rx_test_error, - rx_test_fail, - rx_test_ok -}; -enum rx_test_internal_return +/* This is the structure we store register match data in. See + regex.texinfo for a full description of what registers match. */ +struct re_registers { - rx_test_internal_error, - rx_test_found_first, - rx_test_line_finished + unsigned num_regs; + regoff_t *start; + regoff_t *end; }; -enum rx_test_match_entry -{ - rx_test_start, - rx_test_cache_hit_loop, - rx_test_backreference_check, - rx_test_backtrack_return -}; -struct rx_search_state -{ - /* Two groups of registers are kept. The group with the register state - * of the current test match, and the group that holds the state at the end - * of the best known match, if any. 
- * - * For some patterns, there may also be registers saved on the stack. - */ - unsigned num_regs; /* Includes an element for register zero. */ - regoff_t * lparen; /* scratch space for register returns */ - regoff_t * rparen; - regoff_t * best_lpspace; /* in case the user doesn't want these */ - regoff_t * best_rpspace; /* values, we still need space to store - * them. Normally, this memoryis unused - * and the space pointed to by REGS is - * used instead. - */ - - int last_l; /* Highest index of a valid lparen. */ - int last_r; /* It's dual. */ - - int * best_lparen; /* This contains the best known register */ - int * best_rparen; /* assignments. - * This may point to the same mem as - * best_lpspace, or it might point to memory - * passed by the caller. - */ - int best_last_l; /* best_last_l:best_lparen::last_l:lparen */ - int best_last_r; - - - unsigned char * translate; - - struct rx_string_position outer_pos; - - struct rx_superstate * start_super; - int nfa_choice; - int first_found; /* If true, return after finding any match. */ - int ret_val; - - /* For continuations... */ - enum rx_outer_entry outer_search_resume_pt; - struct re_pattern_buffer * saved_rxb; - int saved_startpos; - int saved_range; - int saved_stop; - int saved_total_size; - rx_get_burst_fn saved_get_burst; - rx_back_check_fn saved_back_check; - struct re_registers * saved_regs; - - /** - ** state for fastmap - **/ - char * fastmap; - int fastmap_chr; - int fastmap_val; - - /* for continuations in the fastmap procedure: */ - enum rx_fastmap_entry fastmap_resume_pt; - - /** - ** state for test_match - **/ - - /* The current superNFA position of the matcher. */ - struct rx_superstate * super; - - /* The matcher interprets a series of instruction frames. - * This is the `instruction counter' for the interpretation. - */ - struct rx_inx * ifr; - - /* We insert a ghost character in the string to prime - * the nfa. 
test_pos.pos, test_pos.str_half, and test_pos.end_half - * keep track of the test-match position and string-half. - */ - unsigned char c; - - /* Position within the string. */ - struct rx_string_position test_pos; - - struct rx_stack_chunk * counter_stack; - struct rx_stack_chunk * backtrack_stack; - int backtrack_frame_bytes; - int chunk_bytes; - struct rx_stack_chunk * free_chunks; - - /* To return from this function, set test_ret and - * `goto test_do_return'. - * - * Possible return values are: - * 1 --- end of string while the superNFA is still going - * 0 --- internal error (out of memory) - * -1 --- search completed by reaching the superNFA fail state - * -2 --- a match was found, maybe not the longest. - * - * When the search is complete (-1), best_last_r indicates whether - * a match was found. - * - * -2 is return only if search_state.first_found is non-zero. - * - * if search_state.first_found is non-zero, a return of -1 indicates no match, - * otherwise, best_last_r has to be checked. - */ - int test_ret; - - int could_have_continued; - -#ifdef RX_DEBUG - int backtrack_depth; - /* There is a search tree with every node as set of deterministic - * transitions in the super nfa. For every branch of a - * backtrack point is an edge in the tree. - * This counts up a pre-order of nodes in that tree. - * It's saved on the search stack and printed when debugging. - */ - int line_no; - int lines_found; +/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, + `re_match_2' returns information about at least this many registers + the first time a `regs' structure is passed. 
*/ +#ifndef RE_NREGS +# define RE_NREGS 30 #endif - /* For continuations within the match tester */ - enum rx_test_match_entry test_match_resume_pt; - struct rx_inx * saved_next_tr_table; - struct rx_inx * saved_this_tr_table; - int saved_reg; - struct rx_backtrack_frame * saved_bf; - -}; -static __inline__ void init_fastmap( struct re_pattern_buffer *, - struct rx_search_state * ); - +/* POSIX specification for registers. Aside from the different names than + `re_registers', POSIX uses an array of structures, instead of a + structure of arrays. */ +typedef struct +{ + regoff_t rm_so; /* Byte offset from string's start to substring's start. */ + regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ +} regmatch_t; -extern char rx_slowmap[]; -extern unsigned char rx_id_translation[]; +/* Declarations for routines. */ -static __inline__ void -init_fastmap( struct re_pattern_buffer * rxb, - struct rx_search_state * search_state ) -{ - search_state->fastmap = (rxb->fastmap - ? (char *)rxb->fastmap - : (char *)rx_slowmap); - /* Update the fastmap now if not correct already. - * When the regexp was compiled, the fastmap was computed - * and stored in a bitset. This expands the bitset into a - * character array containing 1s and 0s. - */ - if ((search_state->fastmap == rxb->fastmap) && !rxb->fastmap_accurate) - rx_blow_up_fastmap (rxb); - search_state->fastmap_chr = -1; - search_state->fastmap_val = 0; - search_state->fastmap_resume_pt = rx_fastmap_start; -} +/* To avoid duplicating every routine declaration -- once with a + prototype (if we are ANSI), and once without (if we aren't) -- we + use the following macro to declare argument types. This + unfortunately clutters up the declarations a bit, but I think it's + worth it. 
*/ -static __inline__ void -uninit_fastmap ( struct re_pattern_buffer * rxb, - struct rx_search_state * search_state ) -{ - /* Unset the fastmap sentinel */ - if (search_state->fastmap_chr >= 0) - search_state->fastmap[search_state->fastmap_chr] - = search_state->fastmap_val; -} +#if __STDC__ -static __inline__ int -fastmap_search ( struct re_pattern_buffer * rxb, int stop, - rx_get_burst_fn get_burst, void * app_closure, - struct rx_search_state * search_state ) -{ - enum rx_fastmap_entry pc; - - if (0) - { - return_continuation: - search_state->fastmap_resume_pt = pc; - return rx_fastmap_continuation; - } - - pc = search_state->fastmap_resume_pt; - - switch (pc) - { - default: - return rx_fastmap_error; - case rx_fastmap_start: - init_fastmap_sentinal: - /* For the sake of fast fastmapping, set a sentinal in the fastmap. - * This sentinal will trap the fastmap loop when it reaches the last - * valid character in a string half. - * - * This must be reset when the fastmap/search loop crosses a string - * boundry, and before returning to the caller. So sometimes, - * the fastmap loop is restarted with `continue', othertimes by - * `goto init_fastmap_sentinal'. - */ - if (search_state->outer_pos.size) - { - search_state->fastmap_chr = ((search_state->outer_pos.search_direction == 1) - ? 
*(search_state->outer_pos.end - 1) - : *search_state->outer_pos.string); - search_state->fastmap_val - = search_state->fastmap[search_state->fastmap_chr]; - search_state->fastmap[search_state->fastmap_chr] = 1; - } - else - { - search_state->fastmap_chr = -1; - search_state->fastmap_val = 0; - } - - if (search_state->outer_pos.pos >= search_state->outer_pos.end) - goto fastmap_hit_bound; - else - { - if (search_state->outer_pos.search_direction == 1) - { - if (search_state->fastmap_val) - { - for (;;) - { - while (!search_state->fastmap[*search_state->outer_pos.pos]) - ++search_state->outer_pos.pos; - return rx_fastmap_ok; - } - } - else - { - for (;;) - { - while (!search_state->fastmap[*search_state->outer_pos.pos]) - ++search_state->outer_pos.pos; - if (*search_state->outer_pos.pos != search_state->fastmap_chr) - return rx_fastmap_ok; - else - { - ++search_state->outer_pos.pos; - if (search_state->outer_pos.pos == search_state->outer_pos.end) - goto fastmap_hit_bound; - } - } - } - } - else - { - __const__ unsigned char * bound; - bound = search_state->outer_pos.string - 1; - if (search_state->fastmap_val) - { - for (;;) - { - while (!search_state->fastmap[*search_state->outer_pos.pos]) - --search_state->outer_pos.pos; - return rx_fastmap_ok; - } - } - else - { - for (;;) - { - while (!search_state->fastmap[*search_state->outer_pos.pos]) - --search_state->outer_pos.pos; - if ((*search_state->outer_pos.pos != search_state->fastmap_chr) || search_state->fastmap_val) - return rx_fastmap_ok; - else - { - --search_state->outer_pos.pos; - if (search_state->outer_pos.pos == bound) - goto fastmap_hit_bound; - } - } - } - } - } - - case rx_fastmap_string_break: - fastmap_hit_bound: - { - /* If we hit a bound, it may be time to fetch another burst - * of string, or it may be time to return a continuation to - * the caller, or it might be time to fail. 
- */ - - int burst_state; - burst_state = get_burst (&search_state->outer_pos, app_closure, stop); - switch (burst_state) - { - default: - case rx_get_burst_error: - return rx_fastmap_error; - case rx_get_burst_continuation: - { - pc = rx_fastmap_string_break; - goto return_continuation; - } - case rx_get_burst_ok: - goto init_fastmap_sentinal; - case rx_get_burst_no_more: - /* ...not a string split, simply no more string. - * - * When searching backward, running out of string - * is reason to quit. - * - * When searching forward, we allow the possibility - * of an (empty) match after the last character in the - * virtual string. So, fall through to the matcher - */ - return ( (search_state->outer_pos.search_direction == 1) - ? rx_fastmap_ok - : rx_fastmap_fail); - } - } - } +# define _RE_ARGS(args) args -} +#else /* not __STDC__ */ - +# define _RE_ARGS(args) () -#ifdef emacs -/* The `emacs' switch turns on certain matching commands - * that make sense only in Emacs. - */ -#include "config.h" -#include "lisp.h" -#include "buffer.h" -#include "syntax.h" -#endif /* emacs */ - -/* Setting RX_MEMDBUG is useful if you have dbmalloc. Maybe with similar - * packages too. - */ -#ifdef RX_MEMDBUG -#include <malloc.h> -#endif /* RX_RX_MEMDBUG */ - -/* We used to test for `BSTRING' here, but only GCC and Emacs define - * `BSTRING', as far as I know, and neither of them use this code. - */ -#if HAVE_STRING_H || __STDC__ -#include <string.h> - -#ifndef bcmp -#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) -#endif +#endif /* not __STDC__ */ -#ifndef bcopy -#define bcopy(s, d, n) memcpy ((d), (s), (n)) -#endif +/* Sets the current default syntax to SYNTAX, and return the old syntax. + You can also simply assign to the `re_syntax_options' variable. 
*/ +extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); -#ifndef bzero -#define bzero(s, n) memset ((s), 0, (n)) -#endif +/* Compile the regular expression PATTERN, with length LENGTH + and syntax given by the global `re_syntax_options', into the buffer + BUFFER. Return NULL if successful, and an error string if not. */ +extern const char *re_compile_pattern + _RE_ARGS ((const char *pattern, size_t length, + struct re_pattern_buffer *buffer)); -#else /* HAVE_STRING_H || __STDC__ */ -#include <strings.h> -#endif /* not (HAVE_STRING_H || __STDC__) */ -#ifdef __STDC__ -#include <stdlib.h> -#else /* not __STDC__ */ -char *malloc (); -char *realloc (); -#endif /* not __STDC__ */ +/* Compile a fastmap for the compiled pattern in BUFFER; used to + accelerate searches. Return 0 if successful and -2 if was an + internal error. */ +extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); - +/* Search in the string STRING (with length LENGTH) for the pattern + compiled into BUFFER. Start searching at position START, for RANGE + characters. Return the starting position of the match, -1 for no + match, or -2 for an internal error. Also return register + information in REGS (if REGS and BUFFER->no_sub are nonzero). */ +extern int re_search + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, int range, struct re_registers *regs)); -/* How many characters in the character set. */ -#define CHAR_SET_SIZE (1 << CHARBITS) -#ifndef emacs -/* Define the syntax basics for \<, \>, etc. - * This must be nonzero for the wordchar and notwordchar pattern - * commands in re_match_2. - */ -#ifndef Sword -#define Sword 1 -#endif -#define SYNTAX(c) re_syntax_table[c] -RX_DECL char re_syntax_table[CHAR_SET_SIZE]; -#endif /* not emacs */ - - -/* Test if at very beginning or at very end of the virtual concatenation - * of `string1' and `string2'. If only one string, it's `string2'. 
- */ - -#define AT_STRINGS_BEG() \ - ( -1 \ - == ((search_state.test_pos.pos - search_state.test_pos.string) \ - + search_state.test_pos.offset)) - -#define AT_STRINGS_END() \ - ( (total_size - 1) \ - == ((search_state.test_pos.pos - search_state.test_pos.string) \ - + search_state.test_pos.offset)) - - -/* Test if POS + 1 points to a character which is word-constituent. We have - * two special cases to check for: if past the end of string1, look at - * the first character in string2; and if before the beginning of - * string2, look at the last character in string1. - * - * Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG (). - */ -#define LETTER_P(POS,OFF) \ - ( SYNTAX (fetch_char(POS, OFF, app_closure, stop)) \ - == Sword) - -/* Test if the character at D and the one after D differ with respect - * to being word-constituent. - */ -#define AT_WORD_BOUNDARY(d) \ - (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d,0) != LETTER_P (d, 1)) - - -#ifdef RX_SUPPORT_CONTINUATIONS -#define RX_STACK_ALLOC(BYTES) malloc(BYTES) -#define RX_STACK_FREE(MEM) free(MEM) -#else -#define RX_STACK_ALLOC(BYTES) alloca(BYTES) -#define RX_STACK_FREE(MEM) \ - ((struct rx_stack_chunk *)MEM)->next_chunk = search_state.free_chunks; \ - search_state.free_chunks = ((struct rx_stack_chunk *)MEM); +/* Like `re_search', but search in the concatenation of STRING1 and + STRING2. Also, stop searching at index START + STOP. 
*/ +extern int re_search_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, int range, struct re_registers *regs, int stop)); -#endif -#define PUSH(CHUNK_VAR,BYTES) \ - if (!CHUNK_VAR || (CHUNK_VAR->bytes_left < (BYTES))) \ - { \ - struct rx_stack_chunk * new_chunk; \ - if (search_state.free_chunks) \ - { \ - new_chunk = search_state.free_chunks; \ - search_state.free_chunks = search_state.free_chunks->next_chunk; \ - } \ - else \ - { \ - new_chunk = (struct rx_stack_chunk *)RX_STACK_ALLOC(search_state.chunk_bytes); \ - if (!new_chunk) \ - { \ - search_state.ret_val = 0; \ - goto test_do_return; \ - } \ - } \ - new_chunk->sp = (char *)new_chunk + sizeof (struct rx_stack_chunk); \ - new_chunk->bytes_left = (search_state.chunk_bytes \ - - (BYTES) \ - - sizeof (struct rx_stack_chunk)); \ - new_chunk->next_chunk = CHUNK_VAR; \ - CHUNK_VAR = new_chunk; \ - } \ - else \ - (CHUNK_VAR->sp += (BYTES)), (CHUNK_VAR->bytes_left -= (BYTES)) - -#define POP(CHUNK_VAR,BYTES) \ - if (CHUNK_VAR->sp == ((char *)CHUNK_VAR + sizeof(*CHUNK_VAR))) \ - { \ - struct rx_stack_chunk * new_chunk = CHUNK_VAR->next_chunk; \ - RX_STACK_FREE(CHUNK_VAR); \ - CHUNK_VAR = new_chunk; \ - } \ - else \ - (CHUNK_VAR->sp -= BYTES), (CHUNK_VAR->bytes_left += BYTES) - - - -#define SRCH_TRANSLATE(C) search_state.translate[(unsigned char) (C)] +/* Like `re_search', but return how many characters in STRING the regexp + in BUFFER matched, starting at position START. */ +extern int re_match + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, struct re_registers *regs)); - +/* Relates to `re_match' as `re_search_2' relates to `re_search'. 
*/ +extern int re_match_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, struct re_registers *regs, int stop)); -#ifdef __STDC__ -RX_DECL __inline__ int -rx_search (struct re_pattern_buffer * rxb, - int startpos, - int range, - int stop, - int total_size, - rx_get_burst_fn get_burst, - rx_back_check_fn back_check, - rx_fetch_char_fn fetch_char, - void * app_closure, - struct re_registers * regs, - struct rx_search_state * resume_state, - struct rx_search_state * save_state) -#else -RX_DECL __inline__ int -rx_search (rxb, startpos, range, stop, total_size, - get_burst, back_check, fetch_char, - app_closure, regs, resume_state, save_state) - struct re_pattern_buffer * rxb; - int startpos; - int range; - int stop; - int total_size; - rx_get_burst_fn get_burst; - rx_back_check_fn back_check; - rx_fetch_char_fn fetch_char; - void * app_closure; - struct re_registers * regs; - struct rx_search_state * resume_state; - struct rx_search_state * save_state; -#endif -{ - int pc; - int test_state; - struct rx_search_state search_state; - - search_state.free_chunks = 0; - if (!resume_state) - pc = rx_outer_start; - else - { - search_state = *resume_state; - regs = search_state.saved_regs; - rxb = search_state.saved_rxb; - startpos = search_state.saved_startpos; - range = search_state.saved_range; - stop = search_state.saved_stop; - total_size = search_state.saved_total_size; - get_burst = search_state.saved_get_burst; - back_check = search_state.saved_back_check; - pc = search_state.outer_search_resume_pt; - if (0) - { - return_continuation: - if (save_state) - { - *save_state = search_state; - save_state->saved_regs = regs; - save_state->saved_rxb = rxb; - save_state->saved_startpos = startpos; - save_state->saved_range = range; - save_state->saved_stop = stop; - save_state->saved_total_size = total_size; - save_state->saved_get_burst = get_burst; - save_state->saved_back_check = back_check; - 
save_state->outer_search_resume_pt = pc; - } - return rx_search_continuation; - } - } - - switch (pc) - { - case rx_outer_start: - search_state.ret_val = rx_search_fail; - ( search_state.lparen - = search_state.rparen - = search_state.best_lpspace - = search_state.best_rpspace - = 0); - - /* figure the number of registers we may need for use in backreferences. - * the number here includes an element for register zero. - */ - search_state.num_regs = rxb->re_nsub + 1; - - - /* check for out-of-range startpos. */ - if ((startpos < 0) || (startpos > total_size)) - return rx_search_fail; - - /* fix up range if it might eventually take us outside the string. */ - { - int endpos; - endpos = startpos + range; - if (endpos < -1) - range = (-1 - startpos); - else if (endpos > (total_size + 1)) - range = total_size - startpos; - } - - /* if the search isn't to be a backwards one, don't waste time in a - * long search for a pattern that says it is anchored. - */ - if (rxb->begbuf_only && (range > 0)) - { - if (startpos > 0) - return rx_search_fail; - else - range = 1; - } - - /* decide whether to use internal or user-provided reg buffers. */ - if (!regs || rxb->no_sub) - { - search_state.best_lpspace = - (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); - search_state.best_rpspace = - (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); - search_state.best_lparen = search_state.best_lpspace; - search_state.best_rparen = search_state.best_rpspace; - } - else - { - /* have the register data arrays been allocated? */ - if (rxb->regs_allocated == REGS_UNALLOCATED) - { /* no. so allocate them with malloc. we need one - extra element beyond `search_state.num_regs' for the `-1' marker - gnu code uses. 
*/ - regs->num_regs = MAX (RE_NREGS, rxb->re_nsub + 1); - regs->start = ((regoff_t *) - malloc (regs->num_regs * sizeof ( regoff_t))); - regs->end = ((regoff_t *) - malloc (regs->num_regs * sizeof ( regoff_t))); - if (regs->start == 0 || regs->end == 0) - return rx_search_error; - rxb->regs_allocated = REGS_REALLOCATE; - } - else if (rxb->regs_allocated == REGS_REALLOCATE) - { /* yes. if we need more elements than were already - allocated, reallocate them. if we need fewer, just - leave it alone. */ - if (regs->num_regs < search_state.num_regs + 1) - { - regs->num_regs = search_state.num_regs + 1; - regs->start = ((regoff_t *) - realloc (regs->start, - regs->num_regs * sizeof (regoff_t))); - regs->end = ((regoff_t *) - realloc (regs->end, - regs->num_regs * sizeof ( regoff_t))); - if (regs->start == 0 || regs->end == 0) - return rx_search_error; - } - } - else if (rxb->regs_allocated != REGS_FIXED) - return rx_search_error; - - if (regs->num_regs < search_state.num_regs + 1) - { - search_state.best_lpspace = - ((regoff_t *) - REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t))); - search_state.best_rpspace = - ((regoff_t *) - REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t))); - search_state.best_lparen = search_state.best_lpspace; - search_state.best_rparen = search_state.best_rpspace; - } - else - { - search_state.best_lparen = regs->start; - search_state.best_rparen = regs->end; - } - } - - search_state.lparen = - (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); - search_state.rparen = - (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)); - - if (! ( search_state.best_rparen - && search_state.best_lparen - && search_state.lparen && search_state.rparen)) - return rx_search_error; - - search_state.best_last_l = search_state.best_last_r = -1; - - search_state.translate = (rxb->translate - ? rxb->translate - : rx_id_translation); - - - - /* - * two nfa's were compiled. - * `0' is complete. 
- * `1' faster but gets registers wrong and ends too soon. - */ - search_state.nfa_choice = (regs && !rxb->least_subs) ? '\0' : '\1'; - - /* we have the option to look for the best match or the first - * one we can find. if the user isn't asking for register information, - * we don't need to find the best match. - */ - search_state.first_found = !regs; - - if (range >= 0) - { - search_state.outer_pos.search_end = startpos + range; - search_state.outer_pos.search_direction = 1; - } - else - { - search_state.outer_pos.search_end = startpos + range; - search_state.outer_pos.search_direction = -1; - } - - /* the vacuous search always turns up nothing. */ - if ((search_state.outer_pos.search_direction == 1) - ? (startpos > search_state.outer_pos.search_end) - : (startpos < search_state.outer_pos.search_end)) - return rx_search_fail; - - /* now we build the starting state of the supernfa. */ - { - struct rx_superset * start_contents; - struct rx_nfa_state_set * start_nfa_set; - - /* we presume here that the nfa start state has only one - * possible future with no side effects. 
- */ - start_nfa_set = rxb->start->futures->destset; - if ( rxb->rx.start_set - && (rxb->rx.start_set->starts_for == &rxb->rx)) - start_contents = rxb->rx.start_set; - else - { - start_contents = - rx_superstate_eclosure_union (&rxb->rx, - rx_superset_cons (&rxb->rx, 0, 0), - start_nfa_set); - - if (!start_contents) - return rx_search_fail; - - start_contents->starts_for = &rxb->rx; - rxb->rx.start_set = start_contents; - } - if ( start_contents->superstate - && (start_contents->superstate->rx_id == rxb->rx.rx_id)) - { - search_state.start_super = start_contents->superstate; - rx_lock_superstate (&rxb->rx, search_state.start_super); - } - else - { - rx_protect_superset (&rxb->rx, start_contents); - - search_state.start_super = rx_superstate (&rxb->rx, start_contents); - if (!search_state.start_super) - return rx_search_fail; - rx_lock_superstate (&rxb->rx, search_state.start_super); - rx_release_superset (&rxb->rx, start_contents); - } - } - - - /* The outer_pos tracks the position within the strings - * as seen by loop that calls fastmap_search. - * - * The caller supplied get_burst function actually - * gives us pointers to chars. - * - * Communication with the get_burst function is through an - * rx_string_position structure. Here, the structure for - * outer_pos is initialized. It is set to point to the - * NULL string, at an offset of STARTPOS. STARTPOS is out - * of range of the NULL string, so the first call to - * getburst will patch up the rx_string_position to point - * to valid characters. 
- */ - - ( search_state.outer_pos.string - = search_state.outer_pos.end - = 0); - - search_state.outer_pos.offset = 0; - search_state.outer_pos.size = 0; - search_state.outer_pos.pos = (unsigned char *)startpos; - init_fastmap (rxb, &search_state); - - search_state.fastmap_resume_pt = rx_fastmap_start; - case rx_outer_fastmap: - /* do { */ - pseudo_do: - { - { - int fastmap_state; - fastmap_state = fastmap_search (rxb, stop, get_burst, app_closure, - &search_state); - switch (fastmap_state) - { - case rx_fastmap_continuation: - pc = rx_outer_fastmap; - goto return_continuation; - case rx_fastmap_fail: - goto finish; - case rx_fastmap_ok: - break; - } - } - - /* now the fastmap loop has brought us to a plausible - * starting point for a match. so, it's time to run the - * nfa and see if a match occured. - */ - startpos = ( search_state.outer_pos.pos - - search_state.outer_pos.string - + search_state.outer_pos.offset); -#if 0 -/*|*/ if ((range > 0) && (startpos == search_state.outer_pos.search_end)) -/*|*/ goto finish; -#endif - } - - search_state.test_match_resume_pt = rx_test_start; - /* do interrupted for entry point... */ - case rx_outer_test: - /* ...do continued */ - { - goto test_match; - test_returns_to_search: - switch (test_state) - { - case rx_test_continuation: - pc = rx_outer_test; - goto return_continuation; - case rx_test_error: - search_state.ret_val = rx_search_error; - goto finish; - case rx_test_fail: - break; - case rx_test_ok: - goto finish; - } - search_state.outer_pos.pos += search_state.outer_pos.search_direction; - startpos += search_state.outer_pos.search_direction; -#if 0 -/*|*/ if (search_state.test_pos.pos < search_state.test_pos.end) -/*|*/ break; -#endif - } - /* do interrupted for entry point... 
*/ - case rx_outer_restore_pos: - { - int x; - x = get_burst (&search_state.outer_pos, app_closure, stop); - switch (x) - { - case rx_get_burst_continuation: - pc = rx_outer_restore_pos; - goto return_continuation; - case rx_get_burst_error: - search_state.ret_val = rx_search_error; - goto finish; - case rx_get_burst_no_more: - if (rxb->can_match_empty) - break; - goto finish; - case rx_get_burst_ok: - break; - } - } /* } while (...see below...) */ - - if ((search_state.outer_pos.search_direction == 1) - ? (startpos <= search_state.outer_pos.search_end) - : (startpos > search_state.outer_pos.search_end)) - goto pseudo_do; - - - finish: - uninit_fastmap (rxb, &search_state); - if (search_state.start_super) - rx_unlock_superstate (&rxb->rx, search_state.start_super); - -#ifdef regex_malloc - if (search_state.lparen) free (search_state.lparen); - if (search_state.rparen) free (search_state.rparen); - if (search_state.best_lpspace) free (search_state.best_lpspace); - if (search_state.best_rpspace) free (search_state.best_rpspace); -#endif - return search_state.ret_val; - } - - - test_match: - { - enum rx_test_match_entry test_pc; - int inx; - test_pc = search_state.test_match_resume_pt; - if (test_pc == rx_test_start) - { -#ifdef RX_DEBUG - search_state.backtrack_depth = 0; -#endif - search_state.last_l = search_state.last_r = 0; - search_state.lparen[0] = startpos; - search_state.super = search_state.start_super; - search_state.c = search_state.nfa_choice; - search_state.test_pos.pos = search_state.outer_pos.pos - 1; - search_state.test_pos.string = search_state.outer_pos.string; - search_state.test_pos.end = search_state.outer_pos.end; - search_state.test_pos.offset = search_state.outer_pos.offset; - search_state.test_pos.size = search_state.outer_pos.size; - search_state.test_pos.search_direction = 1; - search_state.counter_stack = 0; - search_state.backtrack_stack = 0; - search_state.backtrack_frame_bytes = - (sizeof (struct rx_backtrack_frame) - + 
(rxb->match_regs_on_stack - ? sizeof (regoff_t) * (search_state.num_regs + 1) * 2 - : 0)); - search_state.chunk_bytes = search_state.backtrack_frame_bytes * 64; - search_state.test_ret = rx_test_line_finished; - search_state.could_have_continued = 0; - } - /* This is while (1)...except that the body of the loop is interrupted - * by some alternative entry points. - */ - pseudo_while_1: - switch (test_pc) - { - case rx_test_cache_hit_loop: - goto resume_continuation_1; - case rx_test_backreference_check: - goto resume_continuation_2; - case rx_test_backtrack_return: - goto resume_continuation_3; - case rx_test_start: -#ifdef RX_DEBUG - /* There is a search tree with every node as set of deterministic - * transitions in the super nfa. For every branch of a - * backtrack point is an edge in the tree. - * This counts up a pre-order of nodes in that tree. - * It's saved on the search stack and printed when debugging. - */ - search_state.line_no = 0; - search_state.lines_found = 0; -#endif - - top_of_cycle: - /* A superstate is basicly a transition table, indexed by - * characters from the string being tested, and containing - * RX_INX (`instruction frame') structures. - */ - search_state.ifr = &search_state.super->transitions [search_state.c]; - - recurse_test_match: - /* This is the point to which control is sent when the - * test matcher `recurses'. Before jumping here, some variables - * need to be saved on the stack and the next instruction frame - * has to be computed. - */ - - restart: - /* Some instructions don't advance the matcher, but just - * carry out some side effects and fetch a new instruction. - * To dispatch that new instruction, `goto restart'. - */ - - { - struct rx_inx * next_tr_table = NULL; - struct rx_inx * this_tr_table = NULL; - - /* The fastest route through the loop is when the instruction - * is RX_NEXT_CHAR. This case is detected when SEARCH_STATE.IFR->DATA - * is non-zero. In that case, it points to the next - * superstate. 
- * - * This allows us to not bother fetching the bytecode. - */ - next_tr_table = (struct rx_inx *)search_state.ifr->data; - this_tr_table = search_state.super->transitions; - while (next_tr_table) - { -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - { - struct rx_superset * setp; - - fprintf (stderr, "%d %d>> re_next_char @ %d (%d)", - search_state.line_no, - search_state.backtrack_depth, - (search_state.test_pos.pos - search_state.test_pos.string - + search_state.test_pos.offset), search_state.c); - - search_state.super = - ((struct rx_superstate *) - ((char *)this_tr_table - - ((unsigned long) - ((struct rx_superstate *)0)->transitions))); - - setp = search_state.super->contents; - fprintf (stderr, " superstet (rx=%d, &=%x: ", - rxb->rx.rx_id, setp); - while (setp) - { - fprintf (stderr, "%d ", setp->id); - setp = setp->cdr; - } - fprintf (stderr, "\n"); - } -#endif - this_tr_table = next_tr_table; - ++search_state.test_pos.pos; - if (search_state.test_pos.pos == search_state.test_pos.end) - { - int burst_state; - try_burst_1: - burst_state = get_burst (&search_state.test_pos, - app_closure, stop); - switch (burst_state) - { - case rx_get_burst_continuation: - search_state.saved_this_tr_table = this_tr_table; - search_state.saved_next_tr_table = next_tr_table; - test_pc = rx_test_cache_hit_loop; - goto test_return_continuation; - - resume_continuation_1: - /* Continuation one jumps here to do its work: */ - search_state.saved_this_tr_table = this_tr_table; - search_state.saved_next_tr_table = next_tr_table; - goto try_burst_1; - - case rx_get_burst_ok: - /* get_burst succeeded...keep going */ - break; - - case rx_get_burst_no_more: - search_state.test_ret = rx_test_line_finished; - search_state.could_have_continued = 1; - goto test_do_return; - - case rx_get_burst_error: - /* An error... 
*/ - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - } - } - search_state.c = *search_state.test_pos.pos; - search_state.ifr = this_tr_table + search_state.c; - next_tr_table = (struct rx_inx *)search_state.ifr->data; - } /* Fast loop through cached transition tables */ - - /* Here when we ran out of cached next-char transitions. - * So, it will be necessary to do a more expensive - * dispatch on the current instruction. The superstate - * pointer is allowed to become invalid during next-char - * transitions -- now we must bring it up to date. - */ - search_state.super = - ((struct rx_superstate *) - ((char *)this_tr_table - - ((unsigned long) - ((struct rx_superstate *)0)->transitions))); - } - - /* We've encountered an instruction other than next-char. - * Dispatch that instruction: - */ - inx = (int)search_state.ifr->inx; -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - { - struct rx_superset * setp = search_state.super->contents; - - fprintf (stderr, "%d %d>> %s @ %d (%d)", search_state.line_no, - search_state.backtrack_depth, - inx_names[inx], - (search_state.test_pos.pos - search_state.test_pos.string - + (test_pos.half == 0 ? 0 : size1)), search_state.c); - - fprintf (stderr, " superstet (rx=%d, &=%x: ", - rxb->rx.rx_id, setp); - while (setp) - { - fprintf (stderr, "%d ", setp->id); - setp = setp->cdr; - } - fprintf (stderr, "\n"); - } -#endif - switch ((enum rx_opcode)inx) - { - case rx_do_side_effects: - - /* RX_DO_SIDE_EFFECTS occurs when we cross epsilon - * edges associated with parentheses, backreferencing, etc. - */ - { - struct rx_distinct_future * df = - (struct rx_distinct_future *)search_state.ifr->data_2; - struct rx_se_list * el = df->effects; - /* Side effects come in lists. This walks down - * a list, dispatching. 
- */ - while (el) - { - long effect; - effect = (long)el->car; - if (effect < 0) - { -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - { - struct rx_superset * setp = search_state.super->contents; - - fprintf (stderr, "....%d %d>> %s\n", search_state.line_no, - search_state.backtrack_depth, - efnames[-effect]); - } -#endif - switch ((enum re_side_effects) effect) - - { - case re_se_pushback: - search_state.ifr = &df->future_frame; - if (!search_state.ifr->data) - { - struct rx_superstate * sup; - sup = search_state.super; - rx_lock_superstate (rx, sup); - if (!rx_handle_cache_miss (&rxb->rx, - search_state.super, - search_state.c, - (search_state.ifr - ->data_2))) - { - rx_unlock_superstate (rx, sup); - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - } - rx_unlock_superstate (rx, sup); - } - /* --search_state.test_pos.pos; */ - search_state.c = 't'; - search_state.super - = ((struct rx_superstate *) - ((char *)search_state.ifr->data - - (long)(((struct rx_superstate *)0) - ->transitions))); - goto top_of_cycle; - break; - case re_se_push0: - { - struct rx_counter_frame * old_cf - = (search_state.counter_stack - ? 
((struct rx_counter_frame *) - search_state.counter_stack->sp) - : 0); - struct rx_counter_frame * cf; - PUSH (search_state.counter_stack, - sizeof (struct rx_counter_frame)); - cf = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - cf->tag = re_se_iter; - cf->val = 0; - cf->inherited_from = 0; - cf->cdr = old_cf; - break; - } - case re_se_fail: - goto test_do_return; - case re_se_begbuf: - if (!AT_STRINGS_BEG ()) - goto test_do_return; - break; - case re_se_endbuf: - if (!AT_STRINGS_END ()) - goto test_do_return; - break; - case re_se_wordbeg: - if ( LETTER_P (&search_state.test_pos, 1) - && ( AT_STRINGS_BEG() - || !LETTER_P (&search_state.test_pos, 0))) - break; - else - goto test_do_return; - case re_se_wordend: - if ( !AT_STRINGS_BEG () - && LETTER_P (&search_state.test_pos, 0) - && (AT_STRINGS_END () - || !LETTER_P (&search_state.test_pos, 1))) - break; - else - goto test_do_return; - case re_se_wordbound: - if (AT_WORD_BOUNDARY (&search_state.test_pos)) - break; - else - goto test_do_return; - case re_se_notwordbound: - if (!AT_WORD_BOUNDARY (&search_state.test_pos)) - break; - else - goto test_do_return; - case re_se_hat: - if (AT_STRINGS_BEG ()) - { - if (rxb->not_bol) - goto test_do_return; - else - break; - } - else - { - char pos_c = *search_state.test_pos.pos; - if ( (SRCH_TRANSLATE (pos_c) - == SRCH_TRANSLATE('\n')) - && rxb->newline_anchor) - break; - else - goto test_do_return; - } - case re_se_dollar: - if (AT_STRINGS_END ()) - { - if (rxb->not_eol) - goto test_do_return; - else - break; - } - else - { - if ( ( SRCH_TRANSLATE (fetch_char - (&search_state.test_pos, 1, - app_closure, stop)) - == SRCH_TRANSLATE ('\n')) - && rxb->newline_anchor) - break; - else - goto test_do_return; - } - - case re_se_try: - /* This is the first side effect in every - * expression. - * - * FOR NO GOOD REASON...get rid of it... 
- */ - break; - - case re_se_pushpos: - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - struct rx_counter_frame * old_cf - = (search_state.counter_stack - ? ((struct rx_counter_frame *) - search_state.counter_stack->sp) - : 0); - struct rx_counter_frame * cf; - PUSH(search_state.counter_stack, - sizeof (struct rx_counter_frame)); - cf = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - cf->tag = re_se_pushpos; - cf->val = urhere; - cf->inherited_from = 0; - cf->cdr = old_cf; - break; - } - - case re_se_chkpos: - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - struct rx_counter_frame * cf - = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - if (cf->val == urhere) - goto test_do_return; - cf->val = urhere; - break; - } - break; - - case re_se_poppos: - POP(search_state.counter_stack, - sizeof (struct rx_counter_frame)); - break; - - - case re_se_at_dot: - case re_se_syntax: - case re_se_not_syntax: -#ifdef emacs - /* - * this release lacks emacs support - */ -#endif - break; - case re_se_win: - case re_se_lparen: - case re_se_rparen: - case re_se_backref: - case re_se_iter: - case re_se_end_iter: - case re_se_tv: - case re_floogle_flap: - search_state.ret_val = 0; - goto test_do_return; - } - } - else - { -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - fprintf (stderr, "....%d %d>> %s %d %d\n", search_state.line_no, - search_state.backtrack_depth, - efnames2[rxb->se_params [effect].se], - rxb->se_params [effect].op1, - rxb->se_params [effect].op2); -#endif - switch (rxb->se_params [effect].se) - { - case re_se_win: - /* This side effect indicates that we've - * found a match, though not necessarily the - * best match. This is a fancy assignment to - * register 0 unless the caller didn't - * care about registers. In which case, - * this stops the match. 
- */ - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - - if ( (search_state.best_last_r < 0) - || (urhere + 1 > search_state.best_rparen[0])) - { - /* Record the best known and keep - * looking. - */ - int x; - for (x = 0; x <= search_state.last_l; ++x) - search_state.best_lparen[x] = search_state.lparen[x]; - search_state.best_last_l = search_state.last_l; - for (x = 0; x <= search_state.last_r; ++x) - search_state.best_rparen[x] = search_state.rparen[x]; - search_state.best_rparen[0] = urhere + 1; - search_state.best_last_r = search_state.last_r; - } - /* If we're not reporting the match-length - * or other register info, we need look no - * further. - */ - if (search_state.first_found) - { - search_state.test_ret = rx_test_found_first; - goto test_do_return; - } - } - break; - case re_se_lparen: - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - - int reg = rxb->se_params [effect].op1; -#if 0 - if (reg > search_state.last_l) -#endif - { - search_state.lparen[reg] = urhere + 1; - /* In addition to making this assignment, - * we now know that lower numbered regs - * that haven't already been assigned, - * won't be. We make sure they're - * filled with -1, so they can be - * recognized as unassigned. 
- */ - if (search_state.last_l < reg) - while (++search_state.last_l < reg) - search_state.lparen[search_state.last_l] = -1; - } - break; - } - - case re_se_rparen: - { - int urhere = - ((int)(search_state.test_pos.pos - - search_state.test_pos.string) - + search_state.test_pos.offset); - int reg = rxb->se_params [effect].op1; - search_state.rparen[reg] = urhere + 1; - if (search_state.last_r < reg) - { - while (++search_state.last_r < reg) - search_state.rparen[search_state.last_r] - = -1; - } - break; - } - - case re_se_backref: - { - int reg = rxb->se_params [effect].op1; - if ( reg > search_state.last_r - || search_state.rparen[reg] < 0) - goto test_do_return; - - { - int backref_status; - check_backreference: - backref_status - = back_check (&search_state.test_pos, - search_state.lparen[reg], - search_state.rparen[reg], - search_state.translate, - app_closure, - stop); - switch (backref_status) - { - case rx_back_check_continuation: - search_state.saved_reg = reg; - test_pc = rx_test_backreference_check; - goto test_return_continuation; - resume_continuation_2: - reg = search_state.saved_reg; - goto check_backreference; - case rx_back_check_fail: - /* Fail */ - goto test_do_return; - case rx_back_check_pass: - /* pass -- - * test_pos now advanced to last - * char matched by backref - */ - break; - } - } - break; - } - case re_se_iter: - { - struct rx_counter_frame * csp - = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - if (csp->val == rxb->se_params[effect].op2) - goto test_do_return; - else - ++csp->val; - break; - } - case re_se_end_iter: - { - struct rx_counter_frame * csp - = ((struct rx_counter_frame *) - search_state.counter_stack->sp); - if (csp->val < rxb->se_params[effect].op1) - goto test_do_return; - else - { - struct rx_counter_frame * source = csp; - while (source->inherited_from) - source = source->inherited_from; - if (!source || !source->cdr) - { - POP(search_state.counter_stack, - sizeof(struct rx_counter_frame)); - } - 
else - { - source = source->cdr; - csp->val = source->val; - csp->tag = source->tag; - csp->cdr = 0; - csp->inherited_from = source; - } - } - break; - } - case re_se_tv: - /* is a noop */ - break; - case re_se_try: - case re_se_pushback: - case re_se_push0: - case re_se_pushpos: - case re_se_chkpos: - case re_se_poppos: - case re_se_at_dot: - case re_se_syntax: - case re_se_not_syntax: - case re_se_begbuf: - case re_se_hat: - case re_se_wordbeg: - case re_se_wordbound: - case re_se_notwordbound: - case re_se_wordend: - case re_se_endbuf: - case re_se_dollar: - case re_se_fail: - case re_floogle_flap: - search_state.ret_val = 0; - goto test_do_return; - } - } - el = el->cdr; - } - /* Now the side effects are done, - * so get the next instruction. - * and move on. - */ - search_state.ifr = &df->future_frame; - goto restart; - } - - case rx_backtrack_point: - { - /* A backtrack point indicates that we've reached a - * non-determinism in the superstate NFA. This is a - * loop that exhaustively searches the possibilities. - * - * A backtracking strategy is used. We keep track of what - * registers are valid so we can erase side effects. - * - * First, make sure there is some stack space to hold - * our state. - */ - - struct rx_backtrack_frame * bf; - - PUSH(search_state.backtrack_stack, - search_state.backtrack_frame_bytes); -#ifdef RX_DEBUG_0 - ++search_state.backtrack_depth; -#endif - - bf = ((struct rx_backtrack_frame *) - search_state.backtrack_stack->sp); - { - bf->stk_super = search_state.super; - /* We prevent the current superstate from being - * deleted from the superstate cache. 
- */ - rx_lock_superstate (&rxb->rx, search_state.super); -#ifdef RX_DEBUG_0 - bf->stk_search_state.line_no = search_state.line_no; -#endif - bf->stk_c = search_state.c; - bf->stk_test_pos = search_state.test_pos; - bf->stk_last_l = search_state.last_l; - bf->stk_last_r = search_state.last_r; - bf->df = ((struct rx_super_edge *) - search_state.ifr->data_2)->options; - bf->first_df = bf->df; - bf->counter_stack_sp = (search_state.counter_stack - ? search_state.counter_stack->sp - : 0); - bf->stk_test_ret = search_state.test_ret; - if (rxb->match_regs_on_stack) - { - int x; - regoff_t * stk = - (regoff_t *)((char *)bf + sizeof (*bf)); - for (x = 0; x <= search_state.last_l; ++x) - stk[x] = search_state.lparen[x]; - stk += x; - for (x = 0; x <= search_state.last_r; ++x) - stk[x] = search_state.rparen[x]; - } - } - - /* Here is a while loop whose body is mainly a function - * call and some code to handle a return from that - * function. - * - * From here on for the rest of `case backtrack_point' it - * is unsafe to assume that the search_state copies of - * variables saved on the backtracking stack are valid - * -- so read their values from the backtracking stack. - * - * This lets us use one generation fewer stack saves in - * the call-graph of a search. - */ - - while_non_det_options: -#ifdef RX_DEBUG_0 - ++search_state.lines_found; - if (rx_debug_trace) - fprintf (stderr, "@@@ %d calls %d @@@\n", - search_state.line_no, search_state.lines_found); - - search_state.line_no = search_state.lines_found; -#endif - - if (bf->df->next_same_super_edge[0] == bf->first_df) - { - /* This is a tail-call optimization -- we don't recurse - * for the last of the possible futures. - */ - search_state.ifr = (bf->df->effects - ? 
&bf->df->side_effects_frame - : &bf->df->future_frame); - - rx_unlock_superstate (&rxb->rx, search_state.super); - POP(search_state.backtrack_stack, - search_state.backtrack_frame_bytes); -#ifdef RX_DEBUG - --search_state.backtrack_depth; -#endif - goto restart; - } - else - { - if (search_state.counter_stack) - { - struct rx_counter_frame * old_cf - = ((struct rx_counter_frame *)search_state.counter_stack->sp); - struct rx_counter_frame * cf; - PUSH(search_state.counter_stack, sizeof (struct rx_counter_frame)); - cf = ((struct rx_counter_frame *)search_state.counter_stack->sp); - cf->tag = old_cf->tag; - cf->val = old_cf->val; - cf->inherited_from = old_cf; - cf->cdr = 0; - } - /* `Call' this test-match block */ - search_state.ifr = (bf->df->effects - ? &bf->df->side_effects_frame - : &bf->df->future_frame); - goto recurse_test_match; - } - - /* Returns in this block are accomplished by - * goto test_do_return. There are two cases. - * If there is some search-stack left, - * then it is a return from a `recursive' call. - * If there is no search-stack left, then - * we should return to the fastmap/search loop. - */ - - test_do_return: - - if (!search_state.backtrack_stack) - { -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - fprintf (stderr, "!!! %d bails returning %d !!!\n", - search_state.line_no, search_state.test_ret); -#endif - - /* No more search-stack -- this test is done. 
*/ - if (search_state.test_ret != rx_test_internal_error) - goto return_from_test_match; - else - goto error_in_testing_match; - } - - /* Returning from a recursive call to - * the test match block: - */ - - bf = ((struct rx_backtrack_frame *) - search_state.backtrack_stack->sp); -#ifdef RX_DEBUG_0 - if (rx_debug_trace) - fprintf (stderr, "+++ %d returns %d (to %d)+++\n", - search_state.line_no, - search_state.test_ret, - bf->stk_search_state.line_no); -#endif - - while (search_state.counter_stack - && (!bf->counter_stack_sp - || (bf->counter_stack_sp - != search_state.counter_stack->sp))) - { - POP(search_state.counter_stack, - sizeof (struct rx_counter_frame)); - } - - if (search_state.test_ret == rx_test_internal_error) - { - POP (search_state.backtrack_stack, - search_state.backtrack_frame_bytes); - goto test_do_return; - } - - /* If a non-longest match was found and that is good - * enough, return immediately. - */ - if ( (search_state.test_ret == rx_test_found_first) - && search_state.first_found) - { - rx_unlock_superstate (&rxb->rx, bf->stk_super); - POP (search_state.backtrack_stack, - search_state.backtrack_frame_bytes); - goto test_do_return; - } - - search_state.test_ret = bf->stk_test_ret; - search_state.last_l = bf->stk_last_l; - search_state.last_r = bf->stk_last_r; - bf->df = bf->df->next_same_super_edge[0]; - search_state.super = bf->stk_super; - search_state.c = bf->stk_c; -#ifdef RX_DEBUG_0 - search_state.line_no = bf->stk_search_state.line_no; -#endif - - if (rxb->match_regs_on_stack) - { - int x; - regoff_t * stk = - (regoff_t *)((char *)bf + sizeof (*bf)); - for (x = 0; x <= search_state.last_l; ++x) - search_state.lparen[x] = stk[x]; - stk += x; - for (x = 0; x <= search_state.last_r; ++x) - search_state.rparen[x] = stk[x]; - } - - if ((search_state.test_ret != rx_test_line_finished) && - (search_state.test_ret != rx_test_internal_error)) - { - int x; - try_burst_2: - x = get_burst (&bf->stk_test_pos, app_closure, stop); - switch (x) - { - 
case rx_get_burst_continuation: - search_state.saved_bf = bf; - test_pc = rx_test_backtrack_return; - goto test_return_continuation; - resume_continuation_3: - bf = search_state.saved_bf; - goto try_burst_2; - case rx_get_burst_no_more: - /* Since we've been here before, it is some kind of - * error that we can't return. - */ - case rx_get_burst_error: - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - case rx_get_burst_ok: - break; - } - } - search_state.test_pos = bf->stk_test_pos; - goto while_non_det_options; - } - - - case rx_cache_miss: - /* Because the superstate NFA is lazily constructed, - * and in fact may erode from underneath us, we sometimes - * have to construct the next instruction from the hard way. - * This invokes one step in the lazy-conversion. - */ - search_state.ifr = rx_handle_cache_miss (&rxb->rx, - search_state.super, - search_state.c, - search_state.ifr->data_2); - if (!search_state.ifr) - { - search_state.test_ret = rx_test_internal_error; - goto test_do_return; - } - goto restart; - - case rx_backtrack: - /* RX_BACKTRACK means that we've reached the empty - * superstate, indicating that match can't succeed - * from this point. - */ - goto test_do_return; - - case rx_next_char: - case rx_error_inx: - case rx_num_instructions: - search_state.ret_val = 0; - goto test_do_return; - } - goto pseudo_while_1; - } - - /* Healthy exits from the test-match loop do a - * `goto return_from_test_match' On the other hand, - * we might end up here. 
- */ - error_in_testing_match: - test_state = rx_test_error; - goto test_returns_to_search; - - /***** fastmap/search loop body - * considering the results testing for a match - */ - - return_from_test_match: - - if (search_state.best_last_l >= 0) - { - if (regs && (regs->start != search_state.best_lparen)) - { - bcopy (search_state.best_lparen, regs->start, - regs->num_regs * sizeof (int)); - bcopy (search_state.best_rparen, regs->end, - regs->num_regs * sizeof (int)); - } - if (regs && !rxb->no_sub) - { - int q; - int bound = (regs->num_regs < search_state.num_regs - ? regs->num_regs - : search_state.num_regs); - regoff_t * s = regs->start; - regoff_t * e = regs->end; - for (q = search_state.best_last_l + 1; q < bound; ++q) - s[q] = e[q] = -1; - } - search_state.ret_val = search_state.best_lparen[0]; - test_state = rx_test_ok; - goto test_returns_to_search; - } - else - { - test_state = rx_test_fail; - goto test_returns_to_search; - } - - test_return_continuation: - search_state.test_match_resume_pt = test_pc; - test_state = rx_test_continuation; - goto test_returns_to_search; - } -} +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using BUFFER and REGS will use this memory + for recording register information. STARTS and ENDS must be + allocated with malloc, and must each be at least `NUM_REGS * sizeof + (regoff_t)' bytes long. + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. -#endif /* RX_WANT_RX_DEFS */ - - - -#else /* RX_WANT_SE_DEFS */ - /* Integers are used to represent side effects. - * - * Simple side effects are given negative integer names by these enums. - * - * Non-negative names are reserved for complex effects. - * - * Complex effects are those that take arguments. For example, - * a register assignment associated with a group is complex because - * it requires an argument to tell which group is being matched. 
- * - * The integer name of a complex effect is an index into rxb->se_params. - */ - - RX_DEF_SE(1, re_se_try, = -1) /* Epsilon from start state */ - - RX_DEF_SE(0, re_se_pushback, = re_se_try - 1) - RX_DEF_SE(0, re_se_push0, = re_se_pushback -1) - RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1) - RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1) - RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1) - - RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1) /* Emacs only */ - RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */ - RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */ - - RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */ - RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */ - - RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1) - RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1) - RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1) - - RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1) - RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1) - - /* This fails except at the end of a line. - * It deserves to go here since it is typicly one of the last steps - * in a match. - */ - RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1) - - /* Simple effects: */ - RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1) - - /* Complex effects. These are used in the 'se' field of - * a struct re_se_params. Indexes into the se array - * are stored as instructions on nfa edges. - */ - RX_DEF_CPLX_SE(1, re_se_win, = 0) - RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1) - RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1) - RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1) - RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1) - RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1) - RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1) + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. 
*/ +extern void re_set_registers + _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, + unsigned num_regs, regoff_t *starts, regoff_t *ends)); +#if defined _REGEX_RE_COMP || defined _LIBC +# ifndef _CRAY +/* 4.2 bsd compatibility. */ +extern char *re_comp _RE_ARGS ((const char *)); +extern int re_exec _RE_ARGS ((const char *)); +# endif #endif -#if RX_WANT_SE_DEFS != 1 -__END_DECLS -#endif +/* POSIX compatibility. */ +extern int regcomp _RE_ARGS ((regex_t *__preg, const char *__pattern, + int __cflags)); -#endif +extern int regexec _RE_ARGS ((const regex_t *__preg, + const char *__string, size_t __nmatch, + regmatch_t __pmatch[], int __eflags)); + +extern size_t regerror _RE_ARGS ((int __errcode, const regex_t *__preg, + char *__errbuf, size_t __errbuf_size)); + +extern void regfree _RE_ARGS ((regex_t *__preg)); + + +#ifdef __cplusplus +} +#endif /* C++ */ + +#endif /* regex.h */ + +/* +Local variables: +make-backup-files: t +version-control: t +trim-versions-without-asking: nil +End: +*/ diff --git a/include/regexp.h b/include/regexp.h index 174e10b75..fc60d3ca5 100644 --- a/include/regexp.h +++ b/include/regexp.h @@ -1,224 +1,221 @@ -/* - * regexp.h -- old-style regexp compile and step (emulated with POSIX regex) - * Copyright (C) 1993 Rick Sladkey <jrs@world.std.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Library Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Library Public License for more details. - */ - -/* - * Think really hard before you intentionally include this file. - * You should really be using the POSIX regex interface instead. 
- * This emulation file is intended solely for compiling old code. - * - * A program that uses this file must define six macros: INIT, - * GETC, PEEKC, UNGETC, RETURN, and ERROR. This interface is - * so arcane that VMS hackers point at it in ridicule. - */ +/* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ #ifndef _REGEXP_H -#define _REGEXP_H - -#include <sys/types.h> /* regex.h needs size_t */ -#include <regex.h> /* POSIX.2 regexp routines */ -#include <stdlib.h> /* for malloc, realloc and free */ - -/* - * These three advertised external variables record state information - * for compile and step. They are so gross, I'm choking as I write this. - */ -char *loc1; /* the beginning of a match */ -char *loc2; /* the end of a match */ -int circf; /* current pattern begins with '^' */ - -/* - * These are the other variables mentioned in the regexp.h manpage. - * Since we don't emulate them (whatever they do), we want errors if - * they are referenced. Therefore they are commented out here. 
- */ -#if 0 -char *locs; -int sed; -int nbra; -#endif +#define _REGEXP_H 1 + +/* The contents of this header file was first standardized in X/Open + System Interface and Headers Issue 2, originally coming from SysV. + In issue 4, version 2, it is marked as TO BE WITDRAWN. + + This code shouldn't be used in any newly written code. It is + included only for compatibility reasons. Use the POSIX definition + in <regex.h> for portable applications and a reasonable interface. */ + +#include <features.h> +#include <alloca.h> +#include <regex.h> +#include <stdlib.h> +#include <string.h> + +/* The implementation provided here emulates the needed functionality + by mapping to the POSIX regular expression matcher. The interface + for the here included function is weird (this really is a harmless + word). + + The user has to provide six macros before this header file can be + included: + + INIT Declarations vor variables which can be used by the + other macros. + + GETC() Return the value of the next character in the regular + expression pattern. Successive calls should return + successive characters. + + PEEKC() Return the value of the next character in the regular + expression pattern. Immediately successive calls to + PEEKC() should return the same character which should + also be the next character returned by GETC(). + + UNGETC(c) Cause `c' to be returned by the next call to GETC() and + PEEKC(). + + RETURN(ptr) Used for normal exit of the `compile' function. `ptr' + is a pointer to the character after the last character of + the compiled regular expression. + + ERROR(val) Used for abnormal return from `compile'. `val' is the + error number. The error codes are: + 11 Range endpoint too large. + 16 Bad number. + 25 \digit out of range. + 36 Illegal or missing delimiter. + 41 No remembered search string. + 42 \( \) imbalance. + 43 Too many \(. + 44 More tan two numbers given in \{ \}. + 45 } expected after \. + 46 First number exceeds second in \{ \}. + 49 [ ] imbalance. 
+ 50 Regular expression overflow. + + */ + +__BEGIN_DECLS + +/* Interface variables. They contain the results of the successful + calls to `setp' and `advance'. */ +extern char *loc1; +extern char *loc2; + +/* The use of this variable in the `advance' function is not + supported. */ +extern char *locs; + -/* - * We need to stuff a regex_t into an arbitrary buffer so align it. - * GCC make this easy. For the others we have to guess. - */ -#ifdef __GNUC__ -#define __REGEX_T_ALIGN (__alignof__(regex_t)) -#else /* !__GNUC__ */ -#define __REGEX_T_ALIGN 8 -#endif /* !__GNUC__ */ - -#define __regex_t_align(p) \ - ((regex_t *) ((((unsigned long) p) + __REGEX_T_ALIGN - 1) \ - / __REGEX_T_ALIGN * __REGEX_T_ALIGN)) - -/* - * We just slurp the whole pattern into a string and then compile - * it `normally'. With this implementation we never use the PEEKC - * macro. Please feel free to die laughing when we translate - * error symbols into hard-coded numbers. - */ +#ifndef __DO_NOT_DEFINE_COMPILE +/* Get and compile the user supplied pattern up to end of line or + string or until EOF is seen, whatever happens first. The result is + placed in the buffer starting at EXPBUF and delimited by ENDBUF. + + This function cannot be defined in the libc itself since it depends + on the macros. 
*/ char * -compile(char *instring, char *expbuf, char *endbuf, int eof) +compile (char *__restrict instring, char *__restrict expbuf, + __const char *__restrict endbuf, int eof) { - int __c; - int __len; - char *__buf; - int __buflen; - int __error; - regex_t *__preg; - INIT; - - __buflen = 128; - __buf = malloc(__buflen); - if (!__buf) { - ERROR(50); - return 0; - } - __len = 0; - circf = 0; - for (;;) { - __c = GETC(); - if (__c == eof) - break; - if (__c == '\0' || __c == '\n') { - UNGETC(__c); - break; - } - if (__len + 2 > __buflen) { - __buflen *= 2; - __buf = realloc(__buf, __buflen); - if (!__buf) { - ERROR(50); - return 0; - } - } - if (__len == 0 && !circf && __c == '^') - circf = 1; - else - __buf[__len++] = __c; - } - if (__len == 0 && !circf) { - free(__buf); - ERROR(41); - return 0; - } - __buf[__len] = '\0'; - if (endbuf <= expbuf + sizeof(regex_t)) { - free(__buf); - ERROR(50); - return 0; - } - __preg = __regex_t_align(expbuf); - __preg->buffer = (char *) (__preg + 1); - __preg->allocated = endbuf - (char *) __preg->buffer; - __error = regcomp(__preg, __buf, REG_NEWLINE); - free(__buf); - switch (__error) { - case 0: - break; - case REG_BADRPT: - __error = 36; /* poor fit */ - break; - case REG_BADBR: - __error = 16; - break; - case REG_EBRACE: - __error = 44; /* poor fit */ - break; - case REG_EBRACK: - __error = 49; - break; - case REG_ERANGE: - __error = 36; /* poor fit */ - break; - case REG_ECTYPE: - __error = 36; /* poor fit */ - break; - case REG_EPAREN: - __error = 42; - break; - case REG_ESUBREG: - __error = 36; /* poor fit */ - break; - case REG_EEND: - __error = 36; /* poor fit */ - break; - case REG_EESCAPE: - __error = 36; - break; - case REG_BADPAT: - __error = 36; /* poor fit */ - break; - case REG_ESIZE: - __error = 50; - break; - case REG_ESPACE: - __error = 50; - break; - default: - __error = 36; /* as good as any */ - break; - } - if (__error) { - ERROR(__error); - return 0; + char *__input_buffer = NULL; + size_t __input_size = 
0; + size_t __current_size = 0; + int __ch; + int __error; + INIT + + /* Align the expression buffer according to the needs for an object + of type `regex_t'. Then check for minimum size of the buffer for + the compiled regular expression. */ + regex_t *__expr_ptr; +# if defined __GNUC__ && __GNUC__ >= 2 + const size_t __req = __alignof__ (regex_t *); +# else + /* How shall we find out? We simply guess it and can change it is + this really proofs to be wrong. */ + const size_t __req = 8; +# endif + expbuf += __req; + expbuf -= (expbuf - ((char *) 0)) % __req; + if (endbuf < expbuf + sizeof (regex_t)) + { + ERROR (50); + } + __expr_ptr = (regex_t *) expbuf; + /* The remaining space in the buffer can be used for the compiled + pattern. */ + __expr_ptr->buffer = expbuf + sizeof (regex_t); + __expr_ptr->allocated = endbuf - (char *) __expr_ptr->buffer; + + while ((__ch = (GETC ())) != eof) + { + if (__ch == '\0' || __ch == '\n') + { + UNGETC (__ch); + break; } -#ifdef _RX_H - RETURN((__preg->buffer + __preg->rx.allocated - __preg->rx.reserved)); -#else - RETURN((__preg->buffer + __preg->used)); -#endif -} -/* - * Note how we carefully emulate the gross `circf' hack. Otherwise, - * this just looks like an ordinary matching call that records the - * starting and ending match positions. - */ -int -step(char *string, char *expbuf) -{ - int __result; - regmatch_t __pmatch[1]; - - __result = regexec(__regex_t_align(expbuf), string, 1, __pmatch, 0); - if (circf && __pmatch[0].rm_so != 0) - __result = REG_NOMATCH; - if (__result == 0) { - loc1 = string + __pmatch[0].rm_so; - loc2 = string + __pmatch[0].rm_eo; + if (__current_size + 1 >= __input_size) + { + size_t __new_size = __input_size ? 2 * __input_size : 128; + char *__new_room = (char *) alloca (__new_size); + /* See whether we can use the old buffer. 
*/ + if (__new_room + __new_size == __input_buffer) + { + __input_size += __new_size; + __input_buffer = (char *) memcpy (__new_room, __input_buffer, + __current_size); + } + else if (__input_buffer + __input_size == __new_room) + __input_size += __new_size; + else + { + __input_size = __new_size; + __input_buffer = (char *) memcpy (__new_room, __input_buffer, + __current_size); + } } - return __result == 0; -} + __input_buffer[__current_size++] = __ch; + } + __input_buffer[__current_size++] = '\0'; -/* - * For advance we are only supposed to match at the beginning of the - * string. You have to read the man page really carefully to find this - * one. We'll match them kludge-for-kludge. - */ -int -advance(char *string, char *expbuf) -{ - int __old_circf; - int __result; - - __old_circf = circf; - circf = 1; - __result = step(string, expbuf); - circf = __old_circf; - return __result; + /* Now compile the pattern. */ + __error = regcomp (__expr_ptr, __input_buffer, REG_NEWLINE); + if (__error != 0) + /* Oh well, we have to translate POSIX error codes. */ + switch (__error) + { + case REG_BADPAT: + case REG_ECOLLATE: + case REG_ECTYPE: + case REG_EESCAPE: + case REG_BADRPT: + case REG_EEND: + case REG_ERPAREN: + default: + /* There is no matching error code. */ + RETURN (36); + case REG_ESUBREG: + RETURN (25); + case REG_EBRACK: + RETURN (49); + case REG_EPAREN: + RETURN (42); + case REG_EBRACE: + RETURN (44); + case REG_BADBR: + RETURN (46); + case REG_ERANGE: + RETURN (11); + case REG_ESPACE: + case REG_ESIZE: + ERROR (50); + } + + /* Everything is ok. */ + RETURN ((char *) (__expr_ptr->buffer + __expr_ptr->used)); } +#endif + + +/* Find the next match in STRING. The compiled regular expression is + found in the buffer starting at EXPBUF. `loc1' will return the + first character matched and `loc2' points to the next unmatched + character. 
*/ +extern int step __P ((__const char *__restrict __string, + __const char *__restrict __expbuf)); + +/* Match the beginning of STRING with the compiled regular expression + in EXPBUF. If the match is successful `loc2' will contain the + position of the first unmatched character. */ +extern int advance __P ((__const char *__restrict __string, + __const char *__restrict __expbuf)); + + +__END_DECLS -#endif /* _REGEXP_H */ +#endif /* regexp.h */ diff --git a/libc/misc/regex/Makefile b/libc/misc/regex/Makefile index c4c13f6cf..38b7e98bf 100644 --- a/libc/misc/regex/Makefile +++ b/libc/misc/regex/Makefile @@ -24,7 +24,7 @@ TOPDIR=../../ include $(TOPDIR)Rules.mak LIBC=$(TOPDIR)libc.a -CSRC=rx.c +CSRC=regex.c COBJS=$(patsubst %.c,%.o, $(CSRC)) OBJS=$(COBJS) diff --git a/libc/misc/regex/regex.c b/libc/misc/regex/regex.c new file mode 100644 index 000000000..64e754ee0 --- /dev/null +++ b/libc/misc/regex/regex.c @@ -0,0 +1,5725 @@ +/* Extended regular expression matching and search library, + version 0.12. + (Implements POSIX draft P1003.2/D11.2, except for some of the + internationalization features.) + Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. 
*/ + +/* AIX requires this to be the first thing in the file. */ +#if defined _AIX && !defined REGEX_MALLOC +#pragma alloca +#endif + +#undef _GNU_SOURCE +#define _GNU_SOURCE +#define STDC_HEADERS + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#ifndef PARAMS +# if defined __GNUC__ || (defined __STDC__ && __STDC__) +# define PARAMS(args) args +# else +# define PARAMS(args) () +# endif /* GCC. */ +#endif /* Not PARAMS. */ + +#if defined STDC_HEADERS && !defined emacs +# include <stddef.h> +#else +/* We need this for `regex.h', and perhaps for the Emacs include files. */ +# include <sys/types.h> +#endif + +#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) + +/* For platform which support the ISO C amendement 1 functionality we + support user defined character classes. */ +#if defined _LIBC || WIDE_CHAR_SUPPORT +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ +# include <wchar.h> +# include <wctype.h> +#endif + +#ifdef _LIBC +/* We have to keep the namespace clean. 
*/ +# define regfree(preg) __regfree (preg) +# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) +# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) +# define regerror(errcode, preg, errbuf, errbuf_size) \ + __regerror(errcode, preg, errbuf, errbuf_size) +# define re_set_registers(bu, re, nu, st, en) \ + __re_set_registers (bu, re, nu, st, en) +# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ + __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) +# define re_match(bufp, string, size, pos, regs) \ + __re_match (bufp, string, size, pos, regs) +# define re_search(bufp, string, size, startpos, range, regs) \ + __re_search (bufp, string, size, startpos, range, regs) +# define re_compile_pattern(pattern, length, bufp) \ + __re_compile_pattern (pattern, length, bufp) +# define re_set_syntax(syntax) __re_set_syntax (syntax) +# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ + __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) +# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) + +#define btowc __btowc +#endif + +/* This is for other GNU distributions with internationalized messages. */ +#if HAVE_LIBINTL_H || defined _LIBC +# include <libintl.h> +#else +# define gettext(msgid) (msgid) +#endif + +#ifndef gettext_noop +/* This define is so xgettext can find the internationalizable + strings. */ +# define gettext_noop(String) String +#endif + +/* The `emacs' switch turns on certain matching commands + that make sense only in Emacs. */ +#ifdef emacs + +# include "lisp.h" +# include "buffer.h" +# include "syntax.h" + +#else /* not emacs */ + +/* If we are not linking with Emacs proper, + we can't use the relocating allocator + even if config.h says that we can. 
*/ +# undef REL_ALLOC + +# if defined STDC_HEADERS || defined _LIBC +# include <stdlib.h> +# else +char *malloc(); +char *realloc(); +# endif + +/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. + If nothing else has been done, use the method below. */ +# ifdef INHIBIT_STRING_HEADER +# if !(defined HAVE_BZERO && defined HAVE_BCOPY) +# if !defined bzero && !defined bcopy +# undef INHIBIT_STRING_HEADER +# endif +# endif +# endif + +/* This is the normal way of making sure we have a bcopy and a bzero. + This is used in most programs--a few other programs avoid this + by defining INHIBIT_STRING_HEADER. */ +# ifndef INHIBIT_STRING_HEADER +# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC +# include <string.h> +# ifndef bzero +# ifndef _LIBC +# define bzero(s, n) (memset (s, '\0', n), (s)) +# else +# define bzero(s, n) __bzero (s, n) +# endif +# endif +# else +# include <strings.h> +# ifndef memcmp +# define memcmp(s1, s2, n) bcmp (s1, s2, n) +# endif +# ifndef memcpy +# define memcpy(d, s, n) (bcopy (s, d, n), (d)) +# endif +# endif +# endif + +/* Define the syntax stuff for \<, \>, etc. */ + +/* This must be nonzero for the wordchar and notwordchar pattern + commands in re_match_2. */ +# ifndef Sword +# define Sword 1 +# endif + +# ifdef SWITCH_ENUM_BUG +# define SWITCH_ENUM_CAST(x) ((int)(x)) +# else +# define SWITCH_ENUM_CAST(x) (x) +# endif + +#endif /* not emacs */ + +/* Get the interface, including the syntax bits. */ +#include <regex.h> + +/* isalpha etc. are used for the character classes. */ +#include <ctype.h> + +/* Jim Meyering writes: + + "... Some ctype macros are valid only for character codes that + isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when + using /bin/cc or gcc but without giving an ansi option). So, all + ctype uses should be through macros like ISPRINT... If + STDC_HEADERS is defined, then autoconf has verified that the ctype + macros don't need to be guarded with references to isascii. 
... + Defining isascii to 1 should let any compiler worth its salt + eliminate the && through constant folding." + Solaris defines some of these symbols so we must undefine them first. */ + +#undef ISASCII +#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) +# define ISASCII(c) 1 +#else +# define ISASCII(c) isascii(c) +#endif + +#ifdef isblank +# define ISBLANK(c) (ISASCII (c) && isblank (c)) +#else +# define ISBLANK(c) ((c) == ' ' || (c) == '\t') +#endif +#ifdef isgraph +# define ISGRAPH(c) (ISASCII (c) && isgraph (c)) +#else +# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) +#endif + +#undef ISPRINT +#define ISPRINT(c) (ISASCII (c) && isprint (c)) +#define ISDIGIT(c) (ISASCII (c) && isdigit (c)) +#define ISALNUM(c) (ISASCII (c) && isalnum (c)) +#define ISALPHA(c) (ISASCII (c) && isalpha (c)) +#define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) +#define ISLOWER(c) (ISASCII (c) && islower (c)) +#define ISPUNCT(c) (ISASCII (c) && ispunct (c)) +#define ISSPACE(c) (ISASCII (c) && isspace (c)) +#define ISUPPER(c) (ISASCII (c) && isupper (c)) +#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) + +#ifdef _tolower +# define TOLOWER(c) _tolower(c) +#else +# define TOLOWER(c) tolower(c) +#endif + +#ifndef NULL +# define NULL (void *)0 +#endif + +/* We remove any previous definition of `SIGN_EXTEND_CHAR', + since ours (we hope) works properly with all combinations of + machines, compilers, `char' and `unsigned char' argument types. + (Per Bothner suggested the basic approach.) */ +#undef SIGN_EXTEND_CHAR +#if __STDC__ +# define SIGN_EXTEND_CHAR(c) ((signed char) (c)) +#else /* not __STDC__ */ +/* As in Harbison and Steele. */ +# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) +#endif + +#ifndef emacs +/* How many characters in the character set. 
*/ +# define CHAR_SET_SIZE 256 + +# ifdef SYNTAX_TABLE + +extern char *re_syntax_table; + +# else /* not SYNTAX_TABLE */ + +static char re_syntax_table[CHAR_SET_SIZE]; + +static void init_syntax_once() +{ + register int c; + static int done = 0; + + if (done) + return; + bzero(re_syntax_table, sizeof re_syntax_table); + + for (c = 0; c < CHAR_SET_SIZE; ++c) + if (ISALNUM(c)) + re_syntax_table[c] = Sword; + + re_syntax_table['_'] = Sword; + + done = 1; +} + +# endif /* not SYNTAX_TABLE */ + +# define SYNTAX(c) re_syntax_table[((c) & 0xFF)] + +#endif /* emacs */ + +/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we + use `alloca' instead of `malloc'. This is because using malloc in + re_search* or re_match* could cause memory leaks when C-g is used in + Emacs; also, malloc is slower and causes storage fragmentation. On + the other hand, malloc is more portable, and easier to debug. + + Because we sometimes use alloca, some routines have to be macros, + not functions -- `alloca'-allocated space disappears at the end of the + function it is called in. */ + +#ifdef REGEX_MALLOC + +# define REGEX_ALLOCATE malloc +# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) +# define REGEX_FREE free + +#else /* not REGEX_MALLOC */ + +/* Emacs already defines alloca, sometimes. */ +# ifndef alloca + +/* Make alloca work the best possible way. */ +# ifdef __GNUC__ +# define alloca __builtin_alloca +# else /* not __GNUC__ */ +# if HAVE_ALLOCA_H +# include <alloca.h> +# endif /* HAVE_ALLOCA_H */ +# endif /* not __GNUC__ */ + +# endif /* not alloca */ + +# define REGEX_ALLOCATE alloca + +/* Assumes a `char *destination' variable. */ +# define REGEX_REALLOCATE(source, osize, nsize) \ + (destination = (char *) alloca (nsize), \ + memcpy (destination, source, osize)) + +/* No need to do anything to free, after alloca. */ +# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. 
*/ + +#endif /* not REGEX_MALLOC */ + +/* Define how to allocate the failure stack. */ + +#if defined REL_ALLOC && defined REGEX_MALLOC + +# define REGEX_ALLOCATE_STACK(size) \ + r_alloc (&failure_stack_ptr, (size)) +# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ + r_re_alloc (&failure_stack_ptr, (nsize)) +# define REGEX_FREE_STACK(ptr) \ + r_alloc_free (&failure_stack_ptr) + +#else /* not using relocating allocator */ + +# ifdef REGEX_MALLOC + +# define REGEX_ALLOCATE_STACK malloc +# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) +# define REGEX_FREE_STACK free + +# else /* not REGEX_MALLOC */ + +# define REGEX_ALLOCATE_STACK alloca + +# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ + REGEX_REALLOCATE (source, osize, nsize) +/* No need to explicitly free anything. */ +# define REGEX_FREE_STACK(arg) + +# endif /* not REGEX_MALLOC */ +#endif /* not using relocating allocator */ + + +/* True if `size1' is non-NULL and PTR is pointing anywhere inside + `string1' or just past its end. This works if PTR is NULL, which is + a good thing. */ +#define FIRST_STRING_P(ptr) \ + (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) + +/* (Re)Allocate N items of type T using malloc, or fail. */ +#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) +#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) +#define RETALLOC_IF(addr, n, t) \ + if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) +#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) + +#define BYTEWIDTH 8 /* In bits. */ + +#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) + +#undef MAX +#undef MIN +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +typedef char boolean; + +#define false 0 +#define true 1 + +static int re_match_2_internal PARAMS((struct re_pattern_buffer * bufp, + const char *string1, int size1, + const char *string2, int size2, + int pos, + struct re_registers * regs, + + int stop)); + +/* These are the command codes that appear in compiled regular + expressions. Some opcodes are followed by argument bytes. A + command code can specify any interpretation whatsoever for its + arguments. Zero bytes may appear in the compiled regular expression. */ + +typedef enum { + no_op = 0, + + /* Succeed right away--no more backtracking. */ + succeed, + + /* Followed by one byte giving n, then by n literal bytes. */ + exactn, + + /* Matches any (more or less) character. */ + anychar, + + /* Matches any one char belonging to specified set. First + following byte is number of bitmap bytes. Then come bytes + for a bitmap saying which chars are in. Bits in each byte + are ordered low-bit-first. A character is in the set if its + bit is 1. A character too large to have a bit in the map is + automatically not in the set. */ + charset, + + /* Same parameters as charset, but match any character that is + not one of those specified. */ + charset_not, + + /* Start remembering the text that is matched, for storing in a + register. Followed by one byte with the register number, in + the range 0 to one less than the pattern buffer's re_nsub + field. Then followed by one byte with the number of groups + inner to this one. (This last has to be part of the + start_memory only because we need it in the on_failure_jump + of re_match_2.) */ + start_memory, + + /* Stop remembering the text that is matched and store it in a + memory register. Followed by one byte with the register + number, in the range 0 to one less than `re_nsub' in the + pattern buffer, and one byte with the number of inner groups, + just like `start_memory'. 
(We need the number of inner + groups here because we don't have any easy way of finding the + corresponding start_memory when we're at a stop_memory.) */ + stop_memory, + + /* Match a duplicate of something remembered. Followed by one + byte containing the register number. */ + duplicate, + + /* Fail unless at beginning of line. */ + begline, + + /* Fail unless at end of line. */ + endline, + + /* Succeeds if at beginning of buffer (if emacs) or at beginning + of string to be matched (if not). */ + begbuf, + + /* Analogously, for end of buffer/string. */ + endbuf, + + /* Followed by two byte relative address to which to jump. */ + jump, + + /* Same as jump, but marks the end of an alternative. */ + jump_past_alt, + + /* Followed by two-byte relative address of place to resume at + in case of failure. */ + on_failure_jump, + + /* Like on_failure_jump, but pushes a placeholder instead of the + current string position when executed. */ + on_failure_keep_string_jump, + + /* Throw away latest failure point and then jump to following + two-byte relative address. */ + pop_failure_jump, + + /* Change to pop_failure_jump if know won't have to backtrack to + match; otherwise change to jump. This is used to jump + back to the beginning of a repeat. If what follows this jump + clearly won't match what the repeat does, such that we can be + sure that there is no use backtracking out of repetitions + already matched, then we change it to a pop_failure_jump. + Followed by two-byte address. */ + maybe_pop_jump, + + /* Jump to following two-byte address, and push a dummy failure + point. This failure point will be thrown away if an attempt + is made to use it for a failure. A `+' construct makes this + before the first repeat. Also used as an intermediary kind + of jump when compiling an alternative. */ + dummy_failure_jump, + + /* Push a dummy failure point and continue. Used at the end of + alternatives. 
*/ + push_dummy_failure, + + /* Followed by two-byte relative address and two-byte number n. + After matching N times, jump to the address upon failure. */ + succeed_n, + + /* Followed by two-byte relative address, and two-byte number n. + Jump to the address N times, then fail. */ + jump_n, + + /* Set the following two-byte relative address to the + subsequent two-byte number. The address *includes* the two + bytes of number. */ + set_number_at, + + wordchar, /* Matches any word-constituent character. */ + notwordchar, /* Matches any char that is not a word-constituent. */ + + wordbeg, /* Succeeds if at word beginning. */ + wordend, /* Succeeds if at word end. */ + + wordbound, /* Succeeds if at a word boundary. */ + notwordbound /* Succeeds if not at a word boundary. */ +#ifdef emacs + , before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ + + /* Matches any character whose syntax is specified. Followed by + a byte which contains a syntax code, e.g., Sword. */ + syntaxspec, + + /* Matches any character whose syntax is not that specified. */ + notsyntaxspec +#endif /* emacs */ +} re_opcode_t; + +/* Common operations on the compiled pattern. */ + +/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ + +#define STORE_NUMBER(destination, number) \ + do { \ + (destination)[0] = (number) & 0377; \ + (destination)[1] = (number) >> 8; \ + } while (0) + +/* Same as STORE_NUMBER, except increment DESTINATION to + the byte after where the number is stored. Therefore, DESTINATION + must be an lvalue. */ + +#define STORE_NUMBER_AND_INCR(destination, number) \ + do { \ + STORE_NUMBER (destination, number); \ + (destination) += 2; \ + } while (0) + +/* Put into DESTINATION a number stored in two contiguous bytes starting + at SOURCE. 
*/ + +#define EXTRACT_NUMBER(destination, source) \ + do { \ + (destination) = *(source) & 0377; \ + (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ + } while (0) + +#ifdef DEBUG +static void extract_number _RE_ARGS((int *dest, unsigned char *source)); +static void extract_number(dest, source) +int *dest; +unsigned char *source; +{ + int temp = SIGN_EXTEND_CHAR(*(source + 1)); + + *dest = *source & 0377; + *dest += temp << 8; +} + +# ifndef EXTRACT_MACROS /* To debug the macros. */ +# undef EXTRACT_NUMBER +# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) +# endif /* not EXTRACT_MACROS */ + +#endif /* DEBUG */ + +/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. + SOURCE must be an lvalue. */ + +#define EXTRACT_NUMBER_AND_INCR(destination, source) \ + do { \ + EXTRACT_NUMBER (destination, source); \ + (source) += 2; \ + } while (0) + +#ifdef DEBUG +static void extract_number_and_incr _RE_ARGS((int *destination, + unsigned char **source)); +static void extract_number_and_incr(destination, source) +int *destination; +unsigned char **source; +{ + extract_number(destination, *source); + *source += 2; +} + +# ifndef EXTRACT_MACROS +# undef EXTRACT_NUMBER_AND_INCR +# define EXTRACT_NUMBER_AND_INCR(dest, src) \ + extract_number_and_incr (&dest, &src) +# endif /* not EXTRACT_MACROS */ + +#endif /* DEBUG */ + +/* If DEBUG is defined, Regex prints many voluminous messages about what + it is doing (if the variable `debug' is nonzero). If linked with the + main program in `iregex.c', you can enter patterns and strings + interactively. And if linked with the main program in `main.c' and + the other test files, you can run the already-written tests. */ + +#ifdef DEBUG + +/* We use standard I/O for debugging. */ +# include <stdio.h> + +/* It is useful to test things that ``must'' be true when debugging. 
*/ +# include <assert.h> + +static int debug; + +# define DEBUG_STATEMENT(e) e +# define DEBUG_PRINT1(x) if (debug) printf (x) +# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) +# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) +# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) +# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (debug) print_partial_compiled_pattern (s, e) +# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (debug) print_double_string (w, s1, sz1, s2, sz2) + + +/* Print the fastmap in human-readable form. */ + +void print_fastmap(fastmap) +char *fastmap; +{ + unsigned was_a_range = 0; + unsigned i = 0; + + while (i < (1 << BYTEWIDTH)) { + if (fastmap[i++]) { + was_a_range = 0; + putchar(i - 1); + while (i < (1 << BYTEWIDTH) && fastmap[i]) { + was_a_range = 1; + i++; + } + if (was_a_range) { + printf("-"); + putchar(i - 1); + } + } + } + putchar('\n'); +} + + +/* Print a compiled pattern string in human-readable form, starting at + the START pointer into it and ending just before the pointer END. */ + +void print_partial_compiled_pattern(start, end) +unsigned char *start; +unsigned char *end; +{ + int mcnt, mcnt2; + unsigned char *p1; + unsigned char *p = start; + unsigned char *pend = end; + + if (start == NULL) { + printf("(null)\n"); + return; + } + + /* Loop over pattern commands. 
*/ + while (p < pend) { + printf("%d:\t", p - start); + + switch ((re_opcode_t) * p++) { + case no_op: + printf("/no_op"); + break; + + case exactn: + mcnt = *p++; + printf("/exactn/%d", mcnt); + do { + putchar('/'); + putchar(*p++); + } + while (--mcnt); + break; + + case start_memory: + mcnt = *p++; + printf("/start_memory/%d/%d", mcnt, *p++); + break; + + case stop_memory: + mcnt = *p++; + printf("/stop_memory/%d/%d", mcnt, *p++); + break; + + case duplicate: + printf("/duplicate/%d", *p++); + break; + + case anychar: + printf("/anychar"); + break; + + case charset: + case charset_not: + { + register int c, last = -100; + register int in_range = 0; + + printf("/charset [%s", + (re_opcode_t) * (p - 1) == charset_not ? "^" : ""); + + assert(p + *p < pend); + + for (c = 0; c < 256; c++) + if (c / 8 < *p && (p[1 + (c / 8)] & (1 << (c % 8)))) { + /* Are we starting a range? */ + if (last + 1 == c && !in_range) { + putchar('-'); + in_range = 1; + } + /* Have we broken a range? */ + else if (last + 1 != c && in_range) { + putchar(last); + in_range = 0; + } + + if (!in_range) + putchar(c); + + last = c; + } + + if (in_range) + putchar(last); + + putchar(']'); + + p += 1 + *p; + } + break; + + case begline: + printf("/begline"); + break; + + case endline: + printf("/endline"); + break; + + case on_failure_jump: + extract_number_and_incr(&mcnt, &p); + printf("/on_failure_jump to %d", p + mcnt - start); + break; + + case on_failure_keep_string_jump: + extract_number_and_incr(&mcnt, &p); + printf("/on_failure_keep_string_jump to %d", p + mcnt - start); + break; + + case dummy_failure_jump: + extract_number_and_incr(&mcnt, &p); + printf("/dummy_failure_jump to %d", p + mcnt - start); + break; + + case push_dummy_failure: + printf("/push_dummy_failure"); + break; + + case maybe_pop_jump: + extract_number_and_incr(&mcnt, &p); + printf("/maybe_pop_jump to %d", p + mcnt - start); + break; + + case pop_failure_jump: + extract_number_and_incr(&mcnt, &p); + 
printf("/pop_failure_jump to %d", p + mcnt - start); + break; + + case jump_past_alt: + extract_number_and_incr(&mcnt, &p); + printf("/jump_past_alt to %d", p + mcnt - start); + break; + + case jump: + extract_number_and_incr(&mcnt, &p); + printf("/jump to %d", p + mcnt - start); + break; + + case succeed_n: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/succeed_n to %d, %d times", p1 - start, mcnt2); + break; + + case jump_n: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/jump_n to %d, %d times", p1 - start, mcnt2); + break; + + case set_number_at: + extract_number_and_incr(&mcnt, &p); + p1 = p + mcnt; + extract_number_and_incr(&mcnt2, &p); + printf("/set_number_at location %d to %d", p1 - start, mcnt2); + break; + + case wordbound: + printf("/wordbound"); + break; + + case notwordbound: + printf("/notwordbound"); + break; + + case wordbeg: + printf("/wordbeg"); + break; + + case wordend: + printf("/wordend"); + +# ifdef emacs + case before_dot: + printf("/before_dot"); + break; + + case at_dot: + printf("/at_dot"); + break; + + case after_dot: + printf("/after_dot"); + break; + + case syntaxspec: + printf("/syntaxspec"); + mcnt = *p++; + printf("/%d", mcnt); + break; + + case notsyntaxspec: + printf("/notsyntaxspec"); + mcnt = *p++; + printf("/%d", mcnt); + break; +# endif /* emacs */ + + case wordchar: + printf("/wordchar"); + break; + + case notwordchar: + printf("/notwordchar"); + break; + + case begbuf: + printf("/begbuf"); + break; + + case endbuf: + printf("/endbuf"); + break; + + default: + printf("?%d", *(p - 1)); + } + + putchar('\n'); + } + + printf("%d:\tend of pattern.\n", p - start); +} + + +void print_compiled_pattern(bufp) +struct re_pattern_buffer *bufp; +{ + unsigned char *buffer = bufp->buffer; + + print_partial_compiled_pattern(buffer, buffer + bufp->used); + printf("%ld bytes used/%ld bytes allocated.\n", + bufp->used, 
bufp->allocated); + + if (bufp->fastmap_accurate && bufp->fastmap) { + printf("fastmap: "); + print_fastmap(bufp->fastmap); + } + + printf("re_nsub: %d\t", bufp->re_nsub); + printf("regs_alloc: %d\t", bufp->regs_allocated); + printf("can_be_null: %d\t", bufp->can_be_null); + printf("newline_anchor: %d\n", bufp->newline_anchor); + printf("no_sub: %d\t", bufp->no_sub); + printf("not_bol: %d\t", bufp->not_bol); + printf("not_eol: %d\t", bufp->not_eol); + printf("syntax: %lx\n", bufp->syntax); + /* Perhaps we should print the translate table? */ +} + + +void print_double_string(where, string1, size1, string2, size2) +const char *where; +const char *string1; +const char *string2; +int size1; +int size2; +{ + int this_char; + + if (where == NULL) + printf("(null)"); + else { + if (FIRST_STRING_P(where)) { + for (this_char = where - string1; this_char < size1; + this_char++) + putchar(string1[this_char]); + + where = string2; + } + + for (this_char = where - string2; this_char < size2; this_char++) + putchar(string2[this_char]); + } +} + +void printchar(c) +int c; +{ + putc(c, stderr); +} + +#else /* not DEBUG */ + +# undef assert +# define assert(e) + +# define DEBUG_STATEMENT(e) +# define DEBUG_PRINT1(x) +# define DEBUG_PRINT2(x1, x2) +# define DEBUG_PRINT3(x1, x2, x3) +# define DEBUG_PRINT4(x1, x2, x3, x4) +# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) +# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) + +#endif /* not DEBUG */ + +/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ +/* This has no initializer because initialized variables in Emacs + become read-only after dumping. */ +reg_syntax_t re_syntax_options; + + +/* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. 
+ + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + +reg_syntax_t re_set_syntax(syntax) +reg_syntax_t syntax; +{ + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; +#ifdef DEBUG + if (syntax & RE_DEBUG) + debug = 1; + else if (debug) /* was on but now is not */ + debug = 0; +#endif /* DEBUG */ + return ret; +} + +#ifdef _LIBC +weak_alias(__re_set_syntax, re_set_syntax) +#endif +/* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. + POSIX doesn't require that we do anything for REG_NOERROR, + but why not be nice? */ +static const char re_error_msgid[] = { +#define REG_NOERROR_IDX 0 + gettext_noop("Success") /* REG_NOERROR */ + "\0" +#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") + gettext_noop("No match") /* REG_NOMATCH */ + "\0" +#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") + gettext_noop("Invalid regular expression") /* REG_BADPAT */ + "\0" +#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") + gettext_noop("Invalid collation character") /* REG_ECOLLATE */ + "\0" +#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") + gettext_noop("Invalid character class name") /* REG_ECTYPE */ + "\0" +#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") + gettext_noop("Trailing backslash") /* REG_EESCAPE */ + "\0" +#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") + gettext_noop("Invalid back reference") /* REG_ESUBREG */ + "\0" +#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") + gettext_noop("Unmatched [ or [^") /* REG_EBRACK */ + "\0" +#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") + gettext_noop("Unmatched ( or \\(") /* REG_EPAREN */ + "\0" +#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") + 
gettext_noop("Unmatched \\{") /* REG_EBRACE */ + "\0" +#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") + gettext_noop("Invalid content of \\{\\}") /* REG_BADBR */ + "\0" +#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") + gettext_noop("Invalid range end") /* REG_ERANGE */ + "\0" +#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") + gettext_noop("Memory exhausted") /* REG_ESPACE */ + "\0" +#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") + gettext_noop("Invalid preceding regular expression") /* REG_BADRPT */ + "\0" +#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") + gettext_noop("Premature end of regular expression") /* REG_EEND */ + "\0" +#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") + gettext_noop("Regular expression too big") /* REG_ESIZE */ + "\0" +#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") + gettext_noop("Unmatched ) or \\)") /* REG_ERPAREN */ +}; + +static const size_t re_error_msgid_idx[] = { + REG_NOERROR_IDX, + REG_NOMATCH_IDX, + REG_BADPAT_IDX, + REG_ECOLLATE_IDX, + REG_ECTYPE_IDX, + REG_EESCAPE_IDX, + REG_ESUBREG_IDX, + REG_EBRACK_IDX, + REG_EPAREN_IDX, + REG_EBRACE_IDX, + REG_BADBR_IDX, + REG_ERANGE_IDX, + REG_ESPACE_IDX, + REG_BADRPT_IDX, + REG_EEND_IDX, + REG_ESIZE_IDX, + REG_ERPAREN_IDX +}; + +/* Avoiding alloca during matching, to placate r_alloc. */ + +/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the + searching and matching functions should not call alloca. On some + systems, alloca is implemented in terms of malloc, and if we're + using the relocating allocator routines, then malloc could cause a + relocation, which might (if the strings being searched are in the + ralloc heap) shift the data out from underneath the regexp + routines. 
+ + Here's another reason to avoid allocation: Emacs + processes input from X in a signal handler; processing X input may + call malloc; if input arrives while a matching routine is calling + malloc, then we're scrod. But Emacs can't just block input while + calling matching routines; then we don't notice interrupts when + they come in. So, Emacs blocks input around all regexp calls + except the matching calls, which it leaves unprotected, in the + faith that they will not malloc. */ + +/* Normally, this is fine. */ +#define MATCH_MAY_ALLOCATE + +/* When using GNU C, we are not REALLY using the C alloca, no matter + what config.h may say. So don't take precautions for it. */ +#ifdef __GNUC__ +# undef C_ALLOCA +#endif + +/* The match routines may not allocate if (1) they would do it with malloc + and (2) it's not safe for them to use malloc. + Note that if REL_ALLOC is defined, matching would not use malloc for the + failure stack, but we would still use it for the register vectors; + so REL_ALLOC should not affect this. */ +#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs +# undef MATCH_MAY_ALLOCATE +#endif + + +/* Failure stack declarations and macros; both re_compile_fastmap and + re_match_2 use a failure stack. These have to be macros because of + REGEX_ALLOCATE_STACK. */ + + +/* Number of failure points for which to initially allocate space + when matching. If this number is exceeded, we allocate more + space, so it is not a hard limit. */ +#ifndef INIT_FAILURE_ALLOC +# define INIT_FAILURE_ALLOC 5 +#endif + +/* Roughly the maximum number of failure points on the stack. Would be + exactly that if always used MAX_FAILURE_ITEMS items each time we failed. + This is a variable only so users of regex can assign to it; we never + change it ourselves. */ + +#ifdef INT_IS_16BIT + +# if defined MATCH_MAY_ALLOCATE +/* 4400 was enough to cause a crash on Alpha OSF/1, + whose default stack limit is 2mb. 
*/ +long int re_max_failures = 4000; +# else +long int re_max_failures = 2000; +# endif + +union fail_stack_elt { + unsigned char *pointer; + long int integer; +}; + +typedef union fail_stack_elt fail_stack_elt_t; + +typedef struct { + fail_stack_elt_t *stack; + unsigned long int size; + unsigned long int avail; /* Offset of next open position. */ +} fail_stack_type; + +#else /* not INT_IS_16BIT */ + +# if defined MATCH_MAY_ALLOCATE +/* 4400 was enough to cause a crash on Alpha OSF/1, + whose default stack limit is 2mb. */ +int re_max_failures = 20000; +# else +int re_max_failures = 2000; +# endif + +union fail_stack_elt { + unsigned char *pointer; + int integer; +}; + +typedef union fail_stack_elt fail_stack_elt_t; + +typedef struct { + fail_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} fail_stack_type; + +#endif /* INT_IS_16BIT */ + +#define FAIL_STACK_EMPTY() (fail_stack.avail == 0) +#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) +#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) + + +/* Define macros to initialize and free the failure stack. + Do `return -2' if the alloc fails. */ + +#ifdef MATCH_MAY_ALLOCATE +# define INIT_FAIL_STACK() \ + do { \ + fail_stack.stack = (fail_stack_elt_t *) \ + REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ + \ + if (fail_stack.stack == NULL) \ + return -2; \ + \ + fail_stack.size = INIT_FAILURE_ALLOC; \ + fail_stack.avail = 0; \ + } while (0) + +# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) +#else +# define INIT_FAIL_STACK() \ + do { \ + fail_stack.avail = 0; \ + } while (0) + +# define RESET_FAIL_STACK() +#endif + + +/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. + + Return 1 if succeeds, and 0 if either ran out of memory + allocating space for it or it was already too large. + + REGEX_REALLOCATE_STACK requires `destination' be declared. 
*/ + +#define DOUBLE_FAIL_STACK(fail_stack) \ + ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \ + ? 0 \ + : ((fail_stack).stack = (fail_stack_elt_t *) \ + REGEX_REALLOCATE_STACK ((fail_stack).stack, \ + (fail_stack).size * sizeof (fail_stack_elt_t), \ + ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ + \ + (fail_stack).stack == NULL \ + ? 0 \ + : ((fail_stack).size <<= 1, \ + 1))) + + +/* Push pointer POINTER on FAIL_STACK. + Return 1 if was able to do so and 0 if ran out of memory allocating + space to do so. */ +#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ + ((FAIL_STACK_FULL () \ + && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ + ? 0 \ + : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ + 1)) + +/* Push a pointer value onto the failure stack. + Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. */ +#define PUSH_FAILURE_POINTER(item) \ + fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) + +/* This pushes an integer-valued item onto the failure stack. + Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. */ +#define PUSH_FAILURE_INT(item) \ + fail_stack.stack[fail_stack.avail++].integer = (item) + +/* Push a fail_stack_elt_t value onto the failure stack. + Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. */ +#define PUSH_FAILURE_ELT(item) \ + fail_stack.stack[fail_stack.avail++] = (item) + +/* These three POP... operations complement the three PUSH... operations. + All assume that `fail_stack' is nonempty. */ +#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer +#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer +#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] + +/* Used to omit pushing failure point id's when we're not debugging. 
*/ +#ifdef DEBUG +# define DEBUG_PUSH PUSH_FAILURE_INT +# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () +#else +# define DEBUG_PUSH(item) +# define DEBUG_POP(item_addr) +#endif + + +/* Push the information about the state we will need + if we ever fail back to it. + + Requires variables fail_stack, regstart, regend, reg_info, and + num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination' + be declared. + + Does `return FAILURE_CODE' if runs out of memory. */ + +#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ + do { \ + char *destination; \ + /* Must be int, so when we don't save any registers, the arithmetic \ + of 0 + -1 isn't done as unsigned. */ \ + /* Can't be int, since there is not a shred of a guarantee that int \ + is wide enough to hold a value of something to which pointer can \ + be assigned */ \ + active_reg_t this_reg; \ + \ + DEBUG_STATEMENT (failure_id++); \ + DEBUG_STATEMENT (nfailure_points_pushed++); \ + DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ + DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ + DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ + \ + DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \ + DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ + \ + /* Ensure we have enough space allocated for what we will push. */ \ + while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ + { \ + if (!DOUBLE_FAIL_STACK (fail_stack)) \ + return failure_code; \ + \ + DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ + (fail_stack).size); \ + DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ + } \ + \ + /* Push the info, starting with the registers. 
*/ \ + DEBUG_PRINT1 ("\n"); \ + \ + if (1) \ + for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ + this_reg++) \ + { \ + DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \ + DEBUG_STATEMENT (num_regs_pushed++); \ + \ + DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ + PUSH_FAILURE_POINTER (regstart[this_reg]); \ + \ + DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ + PUSH_FAILURE_POINTER (regend[this_reg]); \ + \ + DEBUG_PRINT2 (" info: %p\n ", \ + reg_info[this_reg].word.pointer); \ + DEBUG_PRINT2 (" match_null=%d", \ + REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ + DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ + DEBUG_PRINT2 (" matched_something=%d", \ + MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT2 (" ever_matched=%d", \ + EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT1 ("\n"); \ + PUSH_FAILURE_ELT (reg_info[this_reg].word); \ + } \ + \ + DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\ + PUSH_FAILURE_INT (lowest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\ + PUSH_FAILURE_INT (highest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ + PUSH_FAILURE_POINTER (pattern_place); \ + \ + DEBUG_PRINT2 (" Pushing string %p: `", string_place); \ + DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ + size2); \ + DEBUG_PRINT1 ("'\n"); \ + PUSH_FAILURE_POINTER (string_place); \ + \ + DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ + DEBUG_PUSH (failure_id); \ + } while (0) + +/* This is the number of items that are pushed and popped on the stack + for each register. */ +#define NUM_REG_ITEMS 3 + +/* Individual items aside from the registers. */ +#ifdef DEBUG +# define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ +#else +# define NUM_NONREG_ITEMS 4 +#endif + +/* We push at most this many items on the stack. 
*/ +/* We used to use (num_regs - 1), which is the number of registers + this regexp will save; but that was changed to 5 + to avoid stack overflow for a regexp with lots of parens. */ +#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) + +/* We actually push this many items. */ +#define NUM_FAILURE_ITEMS \ + (((0 \ + ? 0 : highest_active_reg - lowest_active_reg + 1) \ + * NUM_REG_ITEMS) \ + + NUM_NONREG_ITEMS) + +/* How many items can still be added to the stack without overflowing it. */ +#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) + + +/* Pops what PUSH_FAILURE_POINT pushes. + + We restore into the parameters, all of which should be lvalues: + STR -- the saved data position. + PAT -- the saved pattern position. + LOW_REG, HIGH_REG -- the lowest and highest active registers. + REGSTART, REGEND -- arrays of string positions. + REG_INFO -- array of information about each subexpression. + + Also assumes the variables `fail_stack' and (if debugging), `bufp', + `pend', `string1', `size1', `string2', and `size2'. */ + +#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ +{ \ + DEBUG_STATEMENT (unsigned failure_id;) \ + active_reg_t this_reg; \ + const unsigned char *string_temp; \ + \ + assert (!FAIL_STACK_EMPTY ()); \ + \ + /* Remove failure points and point to how many regs pushed. */ \ + DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ + DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ + DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ + \ + assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ + \ + DEBUG_POP (&failure_id); \ + DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ + \ + /* If the saved string location is NULL, it came from an \ + on_failure_keep_string_jump opcode, and we want to throw away the \ + saved NULL, thus retaining our current position in the string. 
*/ \ + string_temp = POP_FAILURE_POINTER (); \ + if (string_temp != NULL) \ + str = (const char *) string_temp; \ + \ + DEBUG_PRINT2 (" Popping string %p: `", str); \ + DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ + DEBUG_PRINT1 ("'\n"); \ + \ + pat = (unsigned char *) POP_FAILURE_POINTER (); \ + DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ + \ + /* Restore register info. */ \ + high_reg = (active_reg_t) POP_FAILURE_INT (); \ + DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \ + \ + low_reg = (active_reg_t) POP_FAILURE_INT (); \ + DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \ + \ + if (1) \ + for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ + { \ + DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \ + \ + reg_info[this_reg].word = POP_FAILURE_ELT (); \ + DEBUG_PRINT2 (" info: %p\n", \ + reg_info[this_reg].word.pointer); \ + \ + regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ + DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ + \ + regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ + DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ + } \ + else \ + { \ + for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ + { \ + reg_info[this_reg].word.integer = 0; \ + regend[this_reg] = 0; \ + regstart[this_reg] = 0; \ + } \ + highest_active_reg = high_reg; \ + } \ + \ + set_regs_matched_done = 0; \ + DEBUG_STATEMENT (nfailure_points_popped++); \ +} /* POP_FAILURE_POINT */ + + + +/* Structure for per-register (a.k.a. per-group) information. + Other register information, such as the + starting and ending positions (which are addresses), and the list of + inner groups (which is a bits list) are maintained in separate + variables. 
+ + We are making a (strictly speaking) nonportable assumption here: that + the compiler will pack our bit fields into something that fits into + the type of `word', i.e., is something that fits into one item on the + failure stack. */ + + +/* Declarations and macros for re_match_2. */ + +typedef union { + fail_stack_elt_t word; + struct { + /* This field is one if this group can match the empty string, + zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ +#define MATCH_NULL_UNSET_VALUE 3 + unsigned match_null_string_p:2; + unsigned is_active:1; + unsigned matched_something:1; + unsigned ever_matched_something:1; + } bits; +} register_info_type; + +#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) +#define IS_ACTIVE(R) ((R).bits.is_active) +#define MATCHED_SOMETHING(R) ((R).bits.matched_something) +#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) + + +/* Call this when have matched a real character; it sets `matched' flags + for the subexpressions which we are currently inside. Also records + that those subexprs have matched. */ +#define SET_REGS_MATCHED() \ + do \ + { \ + if (!set_regs_matched_done) \ + { \ + active_reg_t r; \ + set_regs_matched_done = 1; \ + for (r = lowest_active_reg; r <= highest_active_reg; r++) \ + { \ + MATCHED_SOMETHING (reg_info[r]) \ + = EVER_MATCHED_SOMETHING (reg_info[r]) \ + = 1; \ + } \ + } \ + } \ + while (0) + +/* Registers are set to a sentinel when they haven't yet matched. */ +static char reg_unset_dummy; + +#define REG_UNSET_VALUE (&reg_unset_dummy) +#define REG_UNSET(e) ((e) == REG_UNSET_VALUE) + +/* Subroutine declarations and macros for regex_compile. 
*/ + +static reg_errcode_t regex_compile +_RE_ARGS( + (const char *pattern, size_t size, reg_syntax_t syntax, + struct re_pattern_buffer * bufp)); +static void store_op1 + +_RE_ARGS((re_opcode_t op, unsigned char *loc, int arg)); +static void store_op2 +_RE_ARGS((re_opcode_t op, unsigned char *loc, int arg1, int arg2)); +static void insert_op1 +_RE_ARGS( + + (re_opcode_t op, unsigned char *loc, int arg, + unsigned char *end)); +static void insert_op2 +_RE_ARGS( + (re_opcode_t op, unsigned char *loc, int arg1, int arg2, + + unsigned char *end)); +static boolean at_begline_loc_p +_RE_ARGS((const char *pattern, const char *p, reg_syntax_t syntax)); +static boolean at_endline_loc_p +_RE_ARGS((const char *p, const char *pend, reg_syntax_t syntax)); +static reg_errcode_t compile_range +_RE_ARGS( + (const char **p_ptr, const char *pend, char *translate, + reg_syntax_t syntax, unsigned char *b)); + +/* Fetch the next character in the uncompiled pattern---translating it + if necessary. Also cast from a signed character in the constant + string passed to us by the user to an unsigned char that we can use + as an array index (in, e.g., `translate'). */ +#ifndef PATFETCH +# define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + if (translate) c = (unsigned char) translate[c]; \ + } while (0) +#endif + +/* Fetch the next character in the uncompiled pattern, with no + translation. */ +#define PATFETCH_RAW(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + } while (0) + +/* Go backwards one character in the pattern. */ +#define PATUNFETCH p-- + + +/* If `translate' is non-null, return translate[D], else just D. We + cast the subscript to translate because some data is declared as + `char *', to avoid warnings when a string constant is passed. But + when we use a character as a subscript we must make it unsigned. */ +#ifndef TRANSLATE +# define TRANSLATE(d) \ + (translate ? 
(char) translate[(unsigned char) (d)] : (d)) +#endif + + +/* Macros for outputting the compiled pattern into `buffer'. */ + +/* If the buffer isn't allocated when it comes in, use this. */ +#define INIT_BUF_SIZE 32 + +/* Make sure we have at least N more bytes of space in buffer. */ +#define GET_BUFFER_SPACE(n) \ + while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ + EXTEND_BUFFER () + +/* Make sure we have one more byte of buffer space and then add C to it. */ +#define BUF_PUSH(c) \ + do { \ + GET_BUFFER_SPACE (1); \ + *b++ = (unsigned char) (c); \ + } while (0) + + +/* Ensure we have two more bytes of buffer space and then append C1 and C2. */ +#define BUF_PUSH_2(c1, c2) \ + do { \ + GET_BUFFER_SPACE (2); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + } while (0) + + +/* As with BUF_PUSH_2, except for three bytes. */ +#define BUF_PUSH_3(c1, c2, c3) \ + do { \ + GET_BUFFER_SPACE (3); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + *b++ = (unsigned char) (c3); \ + } while (0) + + +/* Store a jump with opcode OP at LOC to location TO. We store a + relative address offset by the three bytes the jump itself occupies. */ +#define STORE_JUMP(op, loc, to) \ + store_op1 (op, loc, (int) ((to) - (loc) - 3)) + +/* Likewise, for a two-argument jump. */ +#define STORE_JUMP2(op, loc, to, arg) \ + store_op2 (op, loc, (int) ((to) - (loc) - 3), arg) + +/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP(op, loc, to) \ + insert_op1 (op, loc, (int) ((to) - (loc) - 3), b) + +/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP2(op, loc, to, arg) \ + insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b) + + +/* This is not an arbitrary limit: the arguments which represent offsets + into the pattern are two bytes long. So if 2^16 bytes turns out to + be too small, many things would have to change. 
*/ +/* Any other compiler which, like MSC, has allocation limit below 2^16 + bytes will have to use approach similar to what was done below for + MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up + reallocating to 0 bytes. Such thing is not going to work too well. + You have been warned!! */ +#if defined _MSC_VER && !defined WIN32 +/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. + The REALLOC define eliminates a flurry of conversion warnings, + but is not required. */ +# define MAX_BUF_SIZE 65500L +# define REALLOC(p,s) realloc ((p), (size_t) (s)) +#else +# define MAX_BUF_SIZE (1L << 16) +# define REALLOC(p,s) realloc ((p), (s)) +#endif + +/* Extend the buffer by twice its current size via realloc and + reset the pointers that pointed into the old block to point to the + correct places in the new one. If extending the buffer results in it + being larger than MAX_BUF_SIZE, then flag memory exhausted. */ +#define EXTEND_BUFFER() \ + do { \ + unsigned char *old_buffer = bufp->buffer; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\ + if (bufp->buffer == NULL) \ + return REG_ESPACE; \ + /* If the buffer moved, move all the pointers into it. */ \ + if (old_buffer != bufp->buffer) \ + { \ + b = (b - old_buffer) + bufp->buffer; \ + begalt = (begalt - old_buffer) + bufp->buffer; \ + if (fixup_alt_jump) \ + fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ + if (laststart) \ + laststart = (laststart - old_buffer) + bufp->buffer; \ + if (pending_exact) \ + pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ + } \ + } while (0) + + +/* Since we have one byte reserved for the register number argument to + {start,stop}_memory, the maximum number of groups we can report + things about is what fits in that byte. 
*/ +#define MAX_REGNUM 255 + +/* But patterns can have more than `MAX_REGNUM' registers. We just + ignore the excess. */ +typedef unsigned regnum_t; + + +/* Macros for the compile stack. */ + +/* Since offsets can go either forwards or backwards, this type needs to + be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ +/* int may not be enough when sizeof(int) == 2. */ +typedef long pattern_offset_t; + +typedef struct { + pattern_offset_t begalt_offset; + pattern_offset_t fixup_alt_jump; + pattern_offset_t inner_group_offset; + pattern_offset_t laststart_offset; + regnum_t regnum; +} compile_stack_elt_t; + + +typedef struct { + compile_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} compile_stack_type; + + +#define INIT_COMPILE_STACK_SIZE 32 + +#define COMPILE_STACK_EMPTY (compile_stack.avail == 0) +#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) + +/* The next available element. */ +#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) + + +/* Set the bit for character C in a list. */ +#define SET_LIST_BIT(c) \ + (b[((unsigned char) (c)) / BYTEWIDTH] \ + |= 1 << (((unsigned char) c) % BYTEWIDTH)) + + +/* Get the next unsigned number in the uncompiled pattern. */ +#define GET_UNSIGNED_NUMBER(num) \ + { if (p != pend) \ + { \ + PATFETCH (c); \ + while ('0' <= c && c <= '9') \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + if (p == pend) \ + break; \ + PATFETCH (c); \ + } \ + } \ + } + +#if defined _LIBC || WIDE_CHAR_SUPPORT +/* The GNU C library provides support for user-defined character classes + and the functions from ISO C amendment 1. */ +# ifdef CHARCLASS_NAME_MAX +# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX +# else +/* This shouldn't happen but some implementation might still have this + problem. Use a reasonable default value. 
*/ +# define CHAR_CLASS_MAX_LENGTH 256 +# endif + +# ifdef _LIBC +# define IS_CHAR_CLASS(string) __wctype (string) +# else +# define IS_CHAR_CLASS(string) wctype (string) +# endif +#else +# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ + +# define IS_CHAR_CLASS(string) \ + (STREQ (string, "alpha") || STREQ (string, "upper") \ + || STREQ (string, "lower") || STREQ (string, "digit") \ + || STREQ (string, "alnum") || STREQ (string, "xdigit") \ + || STREQ (string, "space") || STREQ (string, "print") \ + || STREQ (string, "punct") || STREQ (string, "graph") \ + || STREQ (string, "cntrl") || STREQ (string, "blank")) +#endif + +#ifndef MATCH_MAY_ALLOCATE + +/* If we cannot allocate large objects within re_match_2_internal, + we make the fail stack and register vectors global. + The fail stack, we grow to the maximum size when a regexp + is compiled. + The register vectors, we adjust in size each time we + compile a regexp, according to the number of registers it needs. */ + +static fail_stack_type fail_stack; + +/* Size with which the following vectors are currently allocated. + That is so we can make them bigger as needed, + but never make them smaller. */ +static int regs_allocated_size; + +static const char **regstart, **regend; +static const char **old_regstart, **old_regend; +static const char **best_regstart, **best_regend; +static register_info_type *reg_info; +static const char **reg_dummy; +static register_info_type *reg_info_dummy; + +/* Make the register vectors big enough for NUM_REGS registers, + but don't make them smaller. 
*/ + +static regex_grow_registers(num_regs) +int num_regs; +{ + if (num_regs > regs_allocated_size) { + RETALLOC_IF(regstart, num_regs, const char *); + RETALLOC_IF(regend, num_regs, const char *); + RETALLOC_IF(old_regstart, num_regs, const char *); + RETALLOC_IF(old_regend, num_regs, const char *); + RETALLOC_IF(best_regstart, num_regs, const char *); + RETALLOC_IF(best_regend, num_regs, const char *); + + RETALLOC_IF(reg_info, num_regs, register_info_type); + RETALLOC_IF(reg_dummy, num_regs, const char *); + + RETALLOC_IF(reg_info_dummy, num_regs, register_info_type); + + regs_allocated_size = num_regs; + } +} + +#endif /* not MATCH_MAY_ALLOCATE */ + +static boolean group_in_compile_stack _RE_ARGS((compile_stack_type + compile_stack, + + regnum_t regnum)); + +/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. + Returns one of error codes defined in `regex.h', or zero for success. + + Assumes the `allocated' (and perhaps `buffer') and `translate' + fields are set in BUFP on entry. + + If it succeeds, results are put in BUFP (if it returns an error, the + contents of BUFP are undefined): + `buffer' is the compiled pattern; + `syntax' is set to SYNTAX; + `used' is set to the length of the compiled pattern; + `fastmap_accurate' is zero; + `re_nsub' is the number of subexpressions in PATTERN; + `not_bol' and `not_eol' are zero; + + The `fastmap' and `newline_anchor' fields are neither + examined nor set. */ + +/* Return, freeing storage we allocated. */ +#define FREE_STACK_RETURN(value) \ + return (free (compile_stack.stack), value) + +static reg_errcode_t regex_compile(pattern, size, syntax, bufp) +const char *pattern; +size_t size; +reg_syntax_t syntax; +struct re_pattern_buffer *bufp; +{ + /* We fetch characters from PATTERN here. Even though PATTERN is + `char *' (i.e., signed), we declare these variables as unsigned, so + they can be reliably used as array indices. 
*/ + register unsigned char c, c1; + + /* A random temporary spot in PATTERN. */ + const char *p1; + + /* Points to the end of the buffer, where we should append. */ + register unsigned char *b; + + /* Keeps track of unclosed groups. */ + compile_stack_type compile_stack; + + /* Points to the current (ending) position in the pattern. */ + const char *p = pattern; + const char *pend = pattern + size; + + /* How to translate the characters in the pattern. */ + RE_TRANSLATE_TYPE translate = bufp->translate; + + /* Address of the count-byte of the most recently inserted `exactn' + command. This makes it possible to tell if a new exact-match + character can be added to that command or if the character requires + a new `exactn' command. */ + unsigned char *pending_exact = 0; + + /* Address of start of the most recently finished expression. + This tells, e.g., postfix * where to find the start of its + operand. Reset at the beginning of groups and alternatives. */ + unsigned char *laststart = 0; + + /* Address of beginning of regexp, or inside of last group. */ + unsigned char *begalt; + + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. */ + const char *beg_interval; + + /* Address of the place where a forward jump should go to the end of + the containing expression. Each alternative of an `or' -- except the + last -- ends with a forward jump of this sort. */ + unsigned char *fixup_alt_jump = 0; + + /* Counts open-groups as they are encountered. Remembered for the + matching close-group on the compile stack, so the same register + number is put in the stop_memory as the start_memory. */ + regnum_t regnum = 0; + +#ifdef DEBUG + DEBUG_PRINT1("\nCompiling pattern: "); + if (debug) { + unsigned debug_count; + + for (debug_count = 0; debug_count < size; debug_count++) + putchar(pattern[debug_count]); + putchar('\n'); + } +#endif /* DEBUG */ + + /* Initialize the compile stack. 
*/ + compile_stack.stack = + TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size = INIT_COMPILE_STACK_SIZE; + compile_stack.avail = 0; + + /* Initialize the pattern buffer. */ + bufp->syntax = syntax; + bufp->fastmap_accurate = 0; + bufp->not_bol = bufp->not_eol = 0; + + /* Set `used' to zero, so that if we return an error, the pattern + printer (for debugging) will think there's no pattern. We reset it + at the end. */ + bufp->used = 0; + + /* Always count groups, whether or not bufp->no_sub is set. */ + bufp->re_nsub = 0; + +#if !defined emacs && !defined SYNTAX_TABLE + /* Initialize the syntax table. */ + init_syntax_once(); +#endif + + if (bufp->allocated == 0) { + if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. */ + RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char); + } else { /* Caller did not allocate a buffer. Do it for them. */ + bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char); + } + if (!bufp->buffer) + FREE_STACK_RETURN(REG_ESPACE); + + bufp->allocated = INIT_BUF_SIZE; + } + + begalt = b = bufp->buffer; + + /* Loop through the uncompiled pattern until we're at the end. */ + while (p != pend) { + PATFETCH(c); + + switch (c) { + case '^': + { + if ( /* If at start of pattern, it's an operator. */ + p == pattern + 1 + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's come before. */ + || at_begline_loc_p(pattern, p, syntax)) + BUF_PUSH(begline); + else + goto normal_char; + } + break; + + + case '$': + { + if ( /* If at end of pattern, it's an operator. */ + p == pend + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's next. 
*/ + || at_endline_loc_p(p, pend, syntax)) + BUF_PUSH(endline); + else + goto normal_char; + } + break; + + + case '+': + case '?': + if ((syntax & RE_BK_PLUS_QM) + || (syntax & RE_LIMITED_OPS)) + goto normal_char; + handle_plus: + case '*': + /* If there is no previous pattern... */ + if (!laststart) { + if (syntax & RE_CONTEXT_INVALID_OPS) + FREE_STACK_RETURN(REG_BADRPT); + else if (!(syntax & RE_CONTEXT_INDEP_OPS)) + goto normal_char; + } + + { + /* Are we optimizing this jump? */ + boolean keep_string_p = false; + + /* 1 means zero (many) matches is allowed. */ + char zero_times_ok = 0, many_times_ok = 0; + + /* If there is a sequence of repetition chars, collapse it + down to just one (the right one). We can't combine + interval operators with these because of, e.g., `a{2}*', + which should only match an even number of `a's. */ + + for (;;) { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + + if (p == pend) + break; + + PATFETCH(c); + + if (c == '*' + || (!(syntax & RE_BK_PLUS_QM) + && (c == '+' || c == '?'))); + + else if (syntax & RE_BK_PLUS_QM && c == '\\') { + if (p == pend) + FREE_STACK_RETURN(REG_EESCAPE); + + PATFETCH(c1); + if (!(c1 == '+' || c1 == '?')) { + PATUNFETCH; + PATUNFETCH; + break; + } + + c = c1; + } else { + PATUNFETCH; + break; + } + + /* If we get here, we found another repeat character. */ + } + + /* Star, etc. applied to an empty pattern is equivalent + to an empty pattern. */ + if (!laststart) + break; + + /* Now we know whether or not zero matches is allowed + and also whether or not two or more matches is allowed. */ + if (many_times_ok) { /* More than one repetition is allowed, so put in at the + end a backward relative jump from `b' to before the next + jump we're going to put in below (which jumps from + laststart to after this jump). + + But if we are at the `*' in the exact sequence `.*\n', + insert an unconditional jump backwards to the ., + instead of the beginning of the loop. 
This way we only + push a failure point once, instead of every time + through the loop. */ + assert(p - 1 > pattern); + + /* Allocate the space for the jump. */ + GET_BUFFER_SPACE(3); + + /* We know we are not at the first character of the pattern, + because laststart was nonzero. And we've already + incremented `p', by the way, to be the character after + the `*'. Do we have to do something analogous here + for null bytes, because of RE_DOT_NOT_NULL? */ + if (TRANSLATE(*(p - 2)) == TRANSLATE('.') + && zero_times_ok + && p < pend && TRANSLATE(*p) == TRANSLATE('\n') + && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ + STORE_JUMP(jump, b, laststart); + keep_string_p = true; + } else + /* Anything else. */ + STORE_JUMP(maybe_pop_jump, b, laststart - 3); + + /* We've added more stuff to the buffer. */ + b += 3; + } + + /* On failure, jump from laststart to b + 3, which will be the + end of the buffer after this jump is inserted. */ + GET_BUFFER_SPACE(3); + INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump + : on_failure_jump, laststart, b + 3); + pending_exact = 0; + b += 3; + + if (!zero_times_ok) { + /* At least one repetition is required, so insert a + `dummy_failure_jump' before the initial + `on_failure_jump' instruction of the loop. This + effects a skip over that instruction the first time + we hit that loop. */ + GET_BUFFER_SPACE(3); + INSERT_JUMP(dummy_failure_jump, laststart, + laststart + 6); + b += 3; + } + } + break; + + + case '.': + laststart = b; + BUF_PUSH(anychar); + break; + + + case '[': + { + boolean had_char_class = false; + + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + /* Ensure that we have enough space to push a charset: the + opcode, the length count, and the bitset; 34 bytes in all. */ + GET_BUFFER_SPACE(34); + + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH(*p == '^' ? 
charset_not : charset); + if (*p == '^') + p++; + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* Push the number of bytes in the bitmap. */ + BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH); + + /* Clear the whole map. */ + bzero(b, (1 << BYTEWIDTH) / BYTEWIDTH); + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-2] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) SET_LIST_BIT('\n'); + + /* Read in characters and ranges, setting map bits. */ + for (;;) { + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + PATFETCH(c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') { + if (p == pend) + FREE_STACK_RETURN(REG_EESCAPE); + + PATFETCH(c1); + SET_LIST_BIT(c1); + continue; + } + + /* Could be the end of the bracket expression. If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + FREE_STACK_RETURN(REG_ERANGE); + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') { + reg_errcode_t ret + = compile_range(&p, pend, translate, syntax, b); + + if (ret != REG_NOERROR) + FREE_STACK_RETURN(ret); + } + + else if (p[0] == '-' && p[1] != ']') { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH(c1); + + ret = compile_range(&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) + FREE_STACK_RETURN(ret); + } + + /* See if we're at the beginning of a possible character + class. 
*/ + + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH(c); + c1 = 0; + + /* If pattern is `[[:'. */ + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + for (;;) { + PATFETCH(c); + if ((c == ':' && *p == ']') || p == pend) + break; + if (c1 < CHAR_CLASS_MAX_LENGTH) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and `:]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but set bits for them). */ + if (c == ':' && *p == ']') { +#if defined _LIBC || WIDE_CHAR_SUPPORT + boolean is_lower = STREQ(str, "lower"); + boolean is_upper = STREQ(str, "upper"); + wctype_t wt; + int ch; + + wt = IS_CHAR_CLASS(str); + if (wt == 0) + FREE_STACK_RETURN(REG_ECTYPE); + + /* Throw away the ] at the end of the character + class. */ + PATFETCH(c); + + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) { +# ifdef _LIBC + if (__iswctype(__btowc(ch), wt)) + SET_LIST_BIT(ch); +# else + if (iswctype(btowc(ch), wt)) + SET_LIST_BIT(ch); +# endif + + if (translate && (is_upper || is_lower) + && (ISUPPER(ch) || ISLOWER(ch))) + SET_LIST_BIT(ch); + } + + had_char_class = true; +#else + int ch; + boolean is_alnum = STREQ(str, "alnum"); + boolean is_alpha = STREQ(str, "alpha"); + boolean is_blank = STREQ(str, "blank"); + boolean is_cntrl = STREQ(str, "cntrl"); + boolean is_digit = STREQ(str, "digit"); + boolean is_graph = STREQ(str, "graph"); + boolean is_lower = STREQ(str, "lower"); + boolean is_print = STREQ(str, "print"); + boolean is_punct = STREQ(str, "punct"); + boolean is_space = STREQ(str, "space"); + boolean is_upper = STREQ(str, "upper"); + boolean is_xdigit = STREQ(str, "xdigit"); + + if (!IS_CHAR_CLASS(str)) + FREE_STACK_RETURN(REG_ECTYPE); + + /* Throw away the ] at the end of the character + class. 
*/ + PATFETCH(c); + + if (p == pend) + FREE_STACK_RETURN(REG_EBRACK); + + for (ch = 0; ch < 1 << BYTEWIDTH; ch++) { + /* This was split into 3 if's to + avoid an arbitrary limit in some compiler. */ + if ((is_alnum && ISALNUM(ch)) + || (is_alpha && ISALPHA(ch)) + || (is_blank && ISBLANK(ch)) + || (is_cntrl && ISCNTRL(ch))) + SET_LIST_BIT(ch); + if ((is_digit && ISDIGIT(ch)) + || (is_graph && ISGRAPH(ch)) + || (is_lower && ISLOWER(ch)) + || (is_print && ISPRINT(ch))) + SET_LIST_BIT(ch); + if ((is_punct && ISPUNCT(ch)) + || (is_space && ISSPACE(ch)) + || (is_upper && ISUPPER(ch)) + || (is_xdigit && ISXDIGIT(ch))) + SET_LIST_BIT(ch); + if (translate && (is_upper || is_lower) + && (ISUPPER(ch) || ISLOWER(ch))) + SET_LIST_BIT(ch); + } + had_char_class = true; +#endif /* libc || wctype.h */ + } else { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT('['); + SET_LIST_BIT(':'); + had_char_class = false; + } + } else { + had_char_class = false; + SET_LIST_BIT(c); + } + } + + /* Discard any (non)matching list bytes that are all 0 at the + end of the map. Decrease the map-length byte too. */ + while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) + b[-1]--; + b += b[-1]; + } + break; + + + case '(': + if (syntax & RE_NO_BK_PARENS) + goto handle_open; + else + goto normal_char; + + + case ')': + if (syntax & RE_NO_BK_PARENS) + goto handle_close; + else + goto normal_char; + + + case '\n': + if (syntax & RE_NEWLINE_ALT) + goto handle_alt; + else + goto normal_char; + + + case '|': + if (syntax & RE_NO_BK_VBAR) + goto handle_alt; + else + goto normal_char; + + + case '{': + if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) + goto handle_interval; + else + goto normal_char; + + + case '\\': + if (p == pend) + FREE_STACK_RETURN(REG_EESCAPE); + + /* Do not translate the character after the \, so that we can + distinguish, e.g., \B from \b, even if we normally would + translate, e.g., B to b. 
*/ + PATFETCH_RAW(c); + + switch (c) { + case '(': + if (syntax & RE_NO_BK_PARENS) + goto normal_backslash; + + handle_open: + bufp->re_nsub++; + regnum++; + + if (COMPILE_STACK_FULL) { + RETALLOC(compile_stack.stack, compile_stack.size << 1, + compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size <<= 1; + } + + /* These are the values to restore when we hit end of this + group. They are all relative offsets, so that if the + whole pattern moves because of realloc, they will still + be valid. */ + COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; + COMPILE_STACK_TOP.fixup_alt_jump + = + fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; + COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; + COMPILE_STACK_TOP.regnum = regnum; + + /* We will eventually replace the 0 with the number of + groups inner to this one. But do not push a + start_memory for groups beyond the last one we can + represent in the compiled pattern. */ + if (regnum <= MAX_REGNUM) { + COMPILE_STACK_TOP.inner_group_offset = + b - bufp->buffer + 2; + BUF_PUSH_3(start_memory, regnum, 0); + } + + compile_stack.avail++; + + fixup_alt_jump = 0; + laststart = 0; + begalt = b; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. */ + pending_exact = 0; + break; + + + case ')': + if (syntax & RE_NO_BK_PARENS) + goto normal_backslash; + + if (COMPILE_STACK_EMPTY) { + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_backslash; + else + FREE_STACK_RETURN(REG_ERPAREN); + } + + handle_close: + if (fixup_alt_jump) { /* Push a dummy failure point at the end of the + alternative for a possible future + `pop_failure_jump' to pop. See comments at + `push_dummy_failure' in `re_match_2'. */ + BUF_PUSH(push_dummy_failure); + + /* We allocated space for this jump when we assigned + to `fixup_alt_jump', in the `handle_alt' case below. 
*/ + STORE_JUMP(jump_past_alt, fixup_alt_jump, b - 1); + } + + /* See similar code for backslashed left paren above. */ + if (COMPILE_STACK_EMPTY) { + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_char; + else + FREE_STACK_RETURN(REG_ERPAREN); + } + + /* Since we just checked for an empty stack above, this + ``can't happen''. */ + assert(compile_stack.avail != 0); + { + /* We don't just want to restore into `regnum', because + later groups should continue to be numbered higher, + as in `(ab)c(de)' -- the second group is #2. */ + regnum_t this_group_regnum; + + compile_stack.avail--; + begalt = + bufp->buffer + COMPILE_STACK_TOP.begalt_offset; + fixup_alt_jump = + COMPILE_STACK_TOP.fixup_alt_jump ? bufp->buffer + + COMPILE_STACK_TOP.fixup_alt_jump - 1 : 0; + laststart = + bufp->buffer + COMPILE_STACK_TOP.laststart_offset; + this_group_regnum = COMPILE_STACK_TOP.regnum; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. */ + pending_exact = 0; + + /* We're at the end of the group, so now we know how many + groups were inside this one. */ + if (this_group_regnum <= MAX_REGNUM) { + unsigned char *inner_group_loc + + = + bufp->buffer + + COMPILE_STACK_TOP.inner_group_offset; + + *inner_group_loc = regnum - this_group_regnum; + BUF_PUSH_3(stop_memory, this_group_regnum, + regnum - this_group_regnum); + } + } + break; + + + case '|': /* `\|'. */ + if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) + goto normal_backslash; + handle_alt: + if (syntax & RE_LIMITED_OPS) + goto normal_char; + + /* Insert before the previous alternative a jump which + jumps to this alternative if the former fails. */ + GET_BUFFER_SPACE(3); + INSERT_JUMP(on_failure_jump, begalt, b + 6); + pending_exact = 0; + b += 3; + + /* The alternative before this one has a jump after it + which gets executed if it gets matched. 
Adjust that + jump so it will jump to this alternative's analogous + jump (put in below, which in turn will jump to the next + (if any) alternative's such jump, etc.). The last such + jump jumps to the correct final destination. A picture: + _____ _____ + | | | | + | v | v + a | b | c + + If we are at `b', then fixup_alt_jump right now points to a + three-byte space after `a'. We'll put in the jump, set + fixup_alt_jump to right after `b', and leave behind three + bytes which we'll fill in when we get to after `c'. */ + + if (fixup_alt_jump) + STORE_JUMP(jump_past_alt, fixup_alt_jump, b); + + /* Mark and leave space for a jump after this alternative, + to be filled in later either by next alternative or + when know we're at the end of a series of alternatives. */ + fixup_alt_jump = b; + GET_BUFFER_SPACE(3); + b += 3; + + laststart = 0; + begalt = b; + break; + + + case '{': + /* If \{ is a literal. */ + if (!(syntax & RE_INTERVALS) + /* If we're at `\{' and it's not the open-interval + operator. */ + || ((syntax & RE_INTERVALS) + && (syntax & RE_NO_BK_BRACES)) || (p - 2 == pattern + && p == pend)) + goto normal_backslash; + + handle_interval: + { + /* If got here, then the syntax allows intervals. */ + + /* At least (most) this many matches must be made. */ + int lower_bound = -1, upper_bound = -1; + + beg_interval = p - 1; + + if (p == pend) { + if (!(syntax & RE_INTERVALS) + && (syntax & RE_NO_BK_BRACES)) goto + unfetch_interval; + else + FREE_STACK_RETURN(REG_EBRACE); + } + + GET_UNSIGNED_NUMBER(lower_bound); + + if (c == ',') { + GET_UNSIGNED_NUMBER(upper_bound); + if ((!(syntax & RE_NO_BK_BRACES) && c != '\\') + || ((syntax & RE_NO_BK_BRACES) && c != '}')) + FREE_STACK_RETURN(REG_BADBR); + + if (upper_bound < 0) + upper_bound = RE_DUP_MAX; + } else + /* Interval such as `{1}' => match exactly once. 
*/ + upper_bound = lower_bound; + + if (lower_bound < 0 || upper_bound > RE_DUP_MAX + || lower_bound > upper_bound) { + if (!(syntax & RE_INTERVALS) + && (syntax & RE_NO_BK_BRACES)) goto + unfetch_interval; + else + FREE_STACK_RETURN(REG_BADBR); + } + + if (!(syntax & RE_NO_BK_BRACES)) { + if (c != '\\') + FREE_STACK_RETURN(REG_EBRACE); + + PATFETCH(c); + } + + if (c != '}') { + if (!(syntax & RE_INTERVALS) + && (syntax & RE_NO_BK_BRACES)) goto + unfetch_interval; + else + FREE_STACK_RETURN(REG_BADBR); + } + + /* We just parsed a valid interval. */ + + /* If it's invalid to have no preceding re. */ + if (!laststart) { + if (syntax & RE_CONTEXT_INVALID_OPS) + FREE_STACK_RETURN(REG_BADRPT); + else if (syntax & RE_CONTEXT_INDEP_OPS) + laststart = b; + else + goto unfetch_interval; + } + + /* If the upper bound is zero, don't want to succeed at + all; jump from `laststart' to `b + 3', which will be + the end of the buffer after we insert the jump. */ + if (upper_bound == 0) { + GET_BUFFER_SPACE(3); + INSERT_JUMP(jump, laststart, b + 3); + b += 3; + } + + /* Otherwise, we have a nontrivial interval. When + we're all done, the pattern will look like: + set_number_at <jump count> <upper bound> + set_number_at <succeed_n count> <lower bound> + succeed_n <after jump addr> <succeed_n count> + <body of loop> + jump_n <succeed_n addr> <jump count> + (The upper bound and `jump_n' are omitted if + `upper_bound' is 1, though.) */ + else { /* If the upper bound is > 1, we need to insert + more at the end of the loop. */ + unsigned nbytes = 10 + (upper_bound > 1) * 10; + + GET_BUFFER_SPACE(nbytes); + + /* Initialize lower bound of the `succeed_n', even + though it will be set during matching by its + attendant `set_number_at' (inserted next), + because `re_compile_fastmap' needs to know. + Jump to the `jump_n' we might insert below. */ + INSERT_JUMP2(succeed_n, laststart, + b + 5 + (upper_bound > 1) * 5, + lower_bound); + b += 5; + + /* Code to initialize the lower bound. 
Insert + before the `succeed_n'. The `5' is the last two + bytes of this `set_number_at', plus 3 bytes of + the following `succeed_n'. */ + insert_op2(set_number_at, laststart, 5, + lower_bound, b); + b += 5; + + if (upper_bound > 1) { /* More than one repetition is allowed, so + append a backward jump to the `succeed_n' + that starts this interval. + + When we've reached this during matching, + we'll have matched the interval once, so + jump back only `upper_bound - 1' times. */ + STORE_JUMP2(jump_n, b, laststart + 5, + upper_bound - 1); + b += 5; + + /* The location we want to set is the second + parameter of the `jump_n'; that is `b-2' as + an absolute address. `laststart' will be + the `set_number_at' we're about to insert; + `laststart+3' the number to set, the source + for the relative address. But we are + inserting into the middle of the pattern -- + so everything is getting moved up by 5. + Conclusion: (b - 2) - (laststart + 3) + 5, + i.e., b - laststart. + + We insert this at the beginning of the loop + so that if we fail during matching, we'll + reinitialize the bounds. */ + insert_op2(set_number_at, laststart, + b - laststart, upper_bound - 1, b); + b += 5; + } + } + pending_exact = 0; + beg_interval = NULL; + } + break; + + unfetch_interval: + /* If an invalid interval, match the characters as literals. */ + assert(beg_interval); + p = beg_interval; + beg_interval = NULL; + + /* normal_char and normal_backslash need `c'. */ + PATFETCH(c); + + if (!(syntax & RE_NO_BK_BRACES)) { + if (p > pattern && p[-1] == '\\') + goto normal_backslash; + } + goto normal_char; + +#ifdef emacs + /* There is no way to specify the before_dot and after_dot + operators. rms says this is ok. 
--karl */ + case '=': + BUF_PUSH(at_dot); + break; + + case 's': + laststart = b; + PATFETCH(c); + BUF_PUSH_2(syntaxspec, syntax_spec_code[c]); + break; + + case 'S': + laststart = b; + PATFETCH(c); + BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]); + break; +#endif /* emacs */ + + + case 'w': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + laststart = b; + BUF_PUSH(wordchar); + break; + + + case 'W': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + laststart = b; + BUF_PUSH(notwordchar); + break; + + + case '<': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(wordbeg); + break; + + case '>': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(wordend); + break; + + case 'b': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(wordbound); + break; + + case 'B': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(notwordbound); + break; + + case '`': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(begbuf); + break; + + case '\'': + if (syntax & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH(endbuf); + break; + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (syntax & RE_NO_BK_REFS) + goto normal_char; + + c1 = c - '0'; + + if (c1 > regnum) + FREE_STACK_RETURN(REG_ESUBREG); + + /* Can't back reference to a subexpression if inside of it. */ + if (group_in_compile_stack(compile_stack, (regnum_t) c1)) + goto normal_char; + + laststart = b; + BUF_PUSH_2(duplicate, c1); + break; + + + case '+': + case '?': + if (syntax & RE_BK_PLUS_QM) + goto handle_plus; + else + goto normal_backslash; + + default: + normal_backslash: + /* You might think it would be useful for \ to mean + not to translate; but if we don't translate it + it will never match anything. */ + c = TRANSLATE(c); + goto normal_char; + } + break; + + + default: + /* Expects the character in `c'. */ + normal_char: + /* If no exactn currently being built. 
*/ + if (!pending_exact + /* If last exactn not at current position. */ + || pending_exact + *pending_exact + 1 != b + /* We have only one byte following the exactn for the count. */ + || *pending_exact == (1 << BYTEWIDTH) - 1 + /* If followed by a repetition operator. */ + || *p == '*' || *p == '^' || ((syntax & RE_BK_PLUS_QM) + ? *p == '\\' && (p[1] == '+' + || p[1] == + '?') : (*p + == + '+' + || + *p + == + '?')) + || ((syntax & RE_INTERVALS) + && ((syntax & RE_NO_BK_BRACES) + ? *p == '{' : (p[0] == '\\' && p[1] == '{')))) { + /* Start building a new exactn. */ + + laststart = b; + + BUF_PUSH_2(exactn, 0); + pending_exact = b - 1; + } + + BUF_PUSH(c); + (*pending_exact)++; + break; + } /* switch (c) */ + } /* while p != pend */ + + + /* Through the pattern now. */ + + if (fixup_alt_jump) + STORE_JUMP(jump_past_alt, fixup_alt_jump, b); + + if (!COMPILE_STACK_EMPTY) + FREE_STACK_RETURN(REG_EPAREN); + + /* If we don't want backtracking, force success + the first time we reach the end of the compiled pattern. */ + if (syntax & RE_NO_POSIX_BACKTRACKING) + BUF_PUSH(succeed); + + free(compile_stack.stack); + + /* We have succeeded; set the length of the buffer. */ + bufp->used = b - bufp->buffer; + +#ifdef DEBUG + if (debug) { + DEBUG_PRINT1("\nCompiled pattern: \n"); + print_compiled_pattern(bufp); + } +#endif /* DEBUG */ + +#ifndef MATCH_MAY_ALLOCATE + /* Initialize the failure stack to the largest possible stack. This + isn't necessary unless we're trying to avoid calling alloca in + the search and match routines. */ + { + int num_regs = bufp->re_nsub + 1; + + /* Since DOUBLE_FAIL_STACK refuses to double only if the current size + is strictly greater than re_max_failures, the largest possible stack + is 2 * re_max_failures failure points. 
*/ + if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) { + fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); + +# ifdef emacs + if (!fail_stack.stack) + fail_stack.stack + = (fail_stack_elt_t *) xmalloc(fail_stack.size + * + sizeof + (fail_stack_elt_t)); + else + fail_stack.stack = + (fail_stack_elt_t *) xrealloc(fail_stack.stack, + (fail_stack.size * + sizeof + (fail_stack_elt_t))); +# else /* not emacs */ + if (!fail_stack.stack) + fail_stack.stack + = (fail_stack_elt_t *) malloc(fail_stack.size + * + sizeof + (fail_stack_elt_t)); + else + fail_stack.stack = + (fail_stack_elt_t *) realloc(fail_stack.stack, + (fail_stack.size * + sizeof + (fail_stack_elt_t))); +# endif /* not emacs */ + } + + regex_grow_registers(num_regs); + } +#endif /* not MATCH_MAY_ALLOCATE */ + + return REG_NOERROR; +} /* regex_compile */ + +/* Subroutines for `regex_compile'. */ + +/* Store OP at LOC followed by two-byte integer parameter ARG. */ + +static void store_op1(op, loc, arg) +re_opcode_t op; +unsigned char *loc; +int arg; +{ + *loc = (unsigned char) op; + STORE_NUMBER(loc + 1, arg); +} + + +/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ + +static void store_op2(op, loc, arg1, arg2) +re_opcode_t op; +unsigned char *loc; +int arg1, arg2; +{ + *loc = (unsigned char) op; + STORE_NUMBER(loc + 1, arg1); + STORE_NUMBER(loc + 3, arg2); +} + + +/* Copy the bytes from LOC to END to open up three bytes of space at LOC + for OP followed by two-byte integer parameter ARG. */ + +static void insert_op1(op, loc, arg, end) +re_opcode_t op; +unsigned char *loc; +int arg; +unsigned char *end; +{ + register unsigned char *pfrom = end; + register unsigned char *pto = end + 3; + + while (pfrom != loc) + *--pto = *--pfrom; + + store_op1(op, loc, arg); +} + + +/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. 
*/ + +static void insert_op2(op, loc, arg1, arg2, end) +re_opcode_t op; +unsigned char *loc; +int arg1, arg2; +unsigned char *end; +{ + register unsigned char *pfrom = end; + register unsigned char *pto = end + 5; + + while (pfrom != loc) + *--pto = *--pfrom; + + store_op2(op, loc, arg1, arg2); +} + + +/* P points to just after a ^ in PATTERN. Return true if that ^ comes + after an alternative or a begin-subexpression. We assume there is at + least one character before the ^. */ + +static boolean at_begline_loc_p(pattern, p, syntax) +const char *pattern, *p; +reg_syntax_t syntax; +{ + const char *prev = p - 2; + boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; + + return + /* After a subexpression? */ + (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) + /* After an alternative? */ + || (*prev == '|' + && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); +} + + +/* The dual of at_begline_loc_p. This one is for $. We assume there is + at least one character after the $, i.e., `P < PEND'. */ + +static boolean at_endline_loc_p(p, pend, syntax) +const char *p, *pend; +reg_syntax_t syntax; +{ + const char *next = p; + boolean next_backslash = *next == '\\'; + const char *next_next = p + 1 < pend ? p + 1 : 0; + + return + /* Before a subexpression? */ + (syntax & RE_NO_BK_PARENS ? *next == ')' + : next_backslash && next_next && *next_next == ')') + /* Before an alternative? */ + || (syntax & RE_NO_BK_VBAR ? *next == '|' + : next_backslash && next_next && *next_next == '|'); +} + + +/* Returns true if REGNUM is in one of COMPILE_STACK's elements and + false if it's not. 
*/ + +static boolean group_in_compile_stack(compile_stack, regnum) +compile_stack_type compile_stack; +regnum_t regnum; +{ + int this_element; + + for (this_element = compile_stack.avail - 1; + this_element >= 0; this_element--) + if (compile_stack.stack[this_element].regnum == regnum) + return true; + + return false; +} + + +/* Read the ending character of a range (in a bracket expression) from the + uncompiled pattern *P_PTR (which ends at PEND). We assume the + starting character is in `P[-2]'. (`P[-1]' is the character `-'.) + Then we set the translation of all bits between the starting and + ending characters (inclusive) in the compiled pattern B. + + Return an error code. + + We use these short variable names so we can use the same macros as + `regex_compile' itself. */ + +static reg_errcode_t compile_range(p_ptr, pend, translate, syntax, b) +const char **p_ptr, *pend; +RE_TRANSLATE_TYPE translate; +reg_syntax_t syntax; +unsigned char *b; +{ + unsigned this_char; + + const char *p = *p_ptr; + reg_errcode_t ret; + char range_start[2]; + char range_end[2]; + char ch[2]; + + if (p == pend) + return REG_ERANGE; + + /* Fetch the endpoints without translating them; the + appropriate translation is done in the bit-setting loop below. */ + range_start[0] = p[-2]; + range_start[1] = '\0'; + range_end[0] = p[0]; + range_end[1] = '\0'; + + /* Have to increment the pointer into the pattern string, so the + caller isn't still at the ending character. */ + (*p_ptr)++; + + /* Report an error if the range is empty and the syntax prohibits this. */ + ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; + + /* Here we see why `this_char' has to be larger than an `unsigned + char' -- we would otherwise go into an infinite loop, since all + characters <= 0xff. 
*/ + ch[1] = '\0'; + for (this_char = 0; this_char <= (unsigned char) -1; ++this_char) { + ch[0] = this_char; + if (strcoll(range_start, ch) <= 0 && strcoll(ch, range_end) <= 0) { + SET_LIST_BIT(TRANSLATE(this_char)); + ret = REG_NOERROR; + } + } + + return ret; +} + +/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in + BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible + characters can start a string that matches the pattern. This fastmap + is used by re_search to skip quickly over impossible starting points. + + The caller must supply the address of a (1 << BYTEWIDTH)-byte data + area as BUFP->fastmap. + + We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in + the pattern buffer. + + Returns 0 if we succeed, -2 if an internal error. */ + +int re_compile_fastmap(bufp) +struct re_pattern_buffer *bufp; +{ + int j, k; + +#ifdef MATCH_MAY_ALLOCATE + fail_stack_type fail_stack; +#endif +#ifndef REGEX_MALLOC + char *destination; +#endif + + register char *fastmap = bufp->fastmap; + unsigned char *pattern = bufp->buffer; + unsigned char *p = pattern; + register unsigned char *pend = pattern + bufp->used; + +#ifdef REL_ALLOC + /* This holds the pointer to the failure stack, when + it is allocated relocatably. */ + fail_stack_elt_t *failure_stack_ptr; +#endif + + /* Assume that each path through the pattern can be null until + proven otherwise. We set this false at the bottom of switch + statement, to which we get only if a particular path doesn't + match the empty string. */ + boolean path_can_be_null = true; + + /* We aren't doing a `succeed_n' to begin with. */ + boolean succeed_n_p = false; + + assert(fastmap != NULL && p != NULL); + + INIT_FAIL_STACK(); + bzero(fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ + bufp->fastmap_accurate = 1; /* It will be when we're done. */ + bufp->can_be_null = 0; + + while (1) { + if (p == pend || *p == succeed) { + /* We have reached the (effective) end of pattern. 
*/ + if (!FAIL_STACK_EMPTY()) { + bufp->can_be_null |= path_can_be_null; + + /* Reset for next path. */ + path_can_be_null = true; + + p = fail_stack.stack[--fail_stack.avail].pointer; + + continue; + } else + break; + } + + /* We should never be about to go beyond the end of the pattern. */ + assert(p < pend); + + switch (SWITCH_ENUM_CAST((re_opcode_t) * p++)) { + + /* I guess the idea here is to simply not bother with a fastmap + if a backreference is used, since it's too hard to figure out + the fastmap for the corresponding group. Setting + `can_be_null' stops `re_search_2' from using the fastmap, so + that is all we do. */ + case duplicate: + bufp->can_be_null = 1; + goto done; + + + /* Following are the cases which match a character. These end + with `break'. */ + + case exactn: + fastmap[p[1]] = 1; + break; + + + case charset: + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) + fastmap[j] = 1; + break; + + + case charset_not: + /* Chars beyond end of map must be allowed. */ + for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + fastmap[j] = 1; + break; + + + case wordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX(j) == Sword) + fastmap[j] = 1; + break; + + + case notwordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX(j) != Sword) + fastmap[j] = 1; + break; + + + case anychar: + { + int fastmap_newline = fastmap['\n']; + + /* `.' matches anything ... */ + for (j = 0; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + /* ... except perhaps newline. */ + if (!(bufp->syntax & RE_DOT_NEWLINE)) + fastmap['\n'] = fastmap_newline; + + /* Return if we have already set `can_be_null'; if we have, + then the fastmap is irrelevant. Something's wrong here. */ + else if (bufp->can_be_null) + goto done; + + /* Otherwise, have to check alternative paths. 
*/ + break; + } + +#ifdef emacs + case syntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX(j) == (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + case notsyntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX(j) != (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + /* All cases after this match the empty string. These end with + `continue'. */ + + + case before_dot: + case at_dot: + case after_dot: + continue; +#endif /* emacs */ + + + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbound: + case notwordbound: + case wordbeg: + case wordend: + case push_dummy_failure: + continue; + + + case jump_n: + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case jump_past_alt: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR(j, p); + p += j; + if (j > 0) + continue; + + /* Jump backward implies we just went through the body of a + loop and matched nothing. Opcode jumped to should be + `on_failure_jump' or `succeed_n'. Just treat it like an + ordinary jump. For a * loop, it has pushed its failure + point already; if so, discard that as redundant. */ + if ((re_opcode_t) * p != on_failure_jump + && (re_opcode_t) * p != succeed_n) + continue; + + p++; + EXTRACT_NUMBER_AND_INCR(j, p); + p += j; + + /* If what's on the stack is where we are now, pop it. */ + if (!FAIL_STACK_EMPTY() + && fail_stack.stack[fail_stack.avail - 1].pointer == p) + fail_stack.avail--; + + continue; + + + case on_failure_jump: + case on_failure_keep_string_jump: + handle_on_failure_jump: + EXTRACT_NUMBER_AND_INCR(j, p); + + /* For some patterns, e.g., `(a?)?', `p+j' here points to the + end of the pattern. We don't want to push such a point, + since when we restore it above, entering the switch will + increment `p' past the end of the pattern. We don't need + to push such a point since we obviously won't find any more + fastmap entries beyond `pend'. 
Such a pattern can match + the null string, though. */ + if (p + j < pend) { + if (!PUSH_PATTERN_OP(p + j, fail_stack)) { + RESET_FAIL_STACK(); + return -2; + } + } else + bufp->can_be_null = 1; + + if (succeed_n_p) { + EXTRACT_NUMBER_AND_INCR(k, p); /* Skip the n. */ + succeed_n_p = false; + } + + continue; + + + case succeed_n: + /* Get to the number of times to succeed. */ + p += 2; + + /* Increment p past the n for when k != 0. */ + EXTRACT_NUMBER_AND_INCR(k, p); + if (k == 0) { + p -= 4; + succeed_n_p = true; /* Spaghetti code alert. */ + goto handle_on_failure_jump; + } + continue; + + + case set_number_at: + p += 4; + continue; + + + case start_memory: + case stop_memory: + p += 2; + continue; + + + default: + abort(); /* We have listed all the cases. */ + } /* switch *p++ */ + + /* Getting here means we have found the possible starting + characters for one path of the pattern -- and that the empty + string does not match. We need not follow this path further. + Instead, look at the next alternative (remembered on the + stack), or quit if no more. The test at the top of the loop + does these things. */ + path_can_be_null = false; + p = pend; + } /* while p */ + + /* Set `can_be_null' for the last path (also the first path, if the + pattern is empty). */ + bufp->can_be_null |= path_can_be_null; + + done: + RESET_FAIL_STACK(); + return 0; +} /* re_compile_fastmap */ + +#ifdef _LIBC +weak_alias(__re_compile_fastmap, re_compile_fastmap) +#endif +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use + this memory for recording register information. STARTS and ENDS + must be allocated using the malloc library routine, and must each + be at least NUM_REGS * sizeof (regoff_t) bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. 
+ + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ +void re_set_registers(bufp, regs, num_regs, starts, ends) +struct re_pattern_buffer *bufp; +struct re_registers *regs; +unsigned num_regs; +regoff_t *starts, *ends; +{ + if (num_regs) { + bufp->regs_allocated = REGS_REALLOCATE; + regs->num_regs = num_regs; + regs->start = starts; + regs->end = ends; + } else { + bufp->regs_allocated = REGS_UNALLOCATED; + regs->num_regs = 0; + regs->start = regs->end = (regoff_t *) 0; + } +} + +#ifdef _LIBC +weak_alias(__re_set_registers, re_set_registers) +#endif +/* Searching routines. */ +/* Like re_search_2, below, but only one string is specified, and + doesn't let you say where to stop matching. */ +int re_search(bufp, string, size, startpos, range, regs) +struct re_pattern_buffer *bufp; +const char *string; +int size, startpos, range; +struct re_registers *regs; +{ + return re_search_2(bufp, NULL, 0, string, size, startpos, range, + regs, size); +} + +#ifdef _LIBC +weak_alias(__re_search, re_search) +#endif +/* Using the compiled pattern in BUFP->buffer, first tries to match the + virtual concatenation of STRING1 and STRING2, starting first at index + STARTPOS, then at STARTPOS + 1, and so on. + + STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. + + RANGE is how far to scan while trying to match. RANGE = 0 means try + only at STARTPOS; in general, the last start tried is STARTPOS + + RANGE. + + In REGS, return the indices of the virtual concatenation of STRING1 + and STRING2 that matched the entire BUFP->buffer and its contained + subexpressions. + + Do not consider matching one past the index STOP in the virtual + concatenation of STRING1 and STRING2. + + We return either the position in the strings at which the match was + found, -1 if no match, or -2 if error (such as failure + stack overflow). 
*/ +int +re_search_2(bufp, string1, size1, string2, size2, startpos, range, regs, + stop) +struct re_pattern_buffer *bufp; +const char *string1, *string2; +int size1, size2; +int startpos; +int range; +struct re_registers *regs; +int stop; +{ + int val; + register char *fastmap = bufp->fastmap; + register RE_TRANSLATE_TYPE translate = bufp->translate; + int total_size = size1 + size2; + int endpos = startpos + range; + + /* Check for out-of-range STARTPOS. */ + if (startpos < 0 || startpos > total_size) + return -1; + + /* Fix up RANGE if it might eventually take us outside + the virtual concatenation of STRING1 and STRING2. + Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */ + if (endpos < 0) + range = 0 - startpos; + else if (endpos > total_size) + range = total_size - startpos; + + /* If the search isn't to be a backwards one, don't waste time in a + search for a pattern that must be anchored. */ + if (bufp->used > 0 && range > 0 + && ((re_opcode_t) bufp->buffer[0] == begbuf + /* `begline' is like `begbuf' if it cannot match at newlines. */ + || ((re_opcode_t) bufp->buffer[0] == begline + && !bufp->newline_anchor))) { + if (startpos > 0) + return -1; + else + range = 1; + } +#ifdef emacs + /* In a forward search for something that starts with \=. + don't keep searching past point. */ + if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot + && range > 0) { + range = PT - startpos; + if (range <= 0) + return -1; + } +#endif /* emacs */ + + /* Update the fastmap now if not correct already. */ + if (fastmap && !bufp->fastmap_accurate) + if (re_compile_fastmap(bufp) == -2) + return -2; + + /* Loop through the string, looking for a place to start matching. */ + for (;;) { + /* If a fastmap is supplied, skip quickly over characters that + cannot be the start of a match. If the pattern can match the + null string, however, we don't need to skip characters; we want + the first null string. 
*/ + if (fastmap && startpos < total_size && !bufp->can_be_null) { + if (range > 0) { /* Searching forwards. */ + register const char *d; + register int lim = 0; + int irange = range; + + if (startpos < size1 && startpos + range >= size1) + lim = range - (size1 - startpos); + + d = + (startpos >= + size1 ? string2 - size1 : string1) + startpos; + + /* Written out as an if-else to avoid testing `translate' + inside the loop. */ + if (translate) + while (range > lim && !fastmap[(unsigned char) + translate[ + (unsigned + char) *d++]]) + range--; + else + while (range > lim && !fastmap[(unsigned char) *d++]) + range--; + + startpos += irange - range; + } else { /* Searching backwards. */ + + register char c = (size1 == 0 || startpos >= size1 + ? string2[startpos - size1] + : string1[startpos]); + + if (!fastmap[(unsigned char) TRANSLATE(c)]) + goto advance; + } + } + + /* If can't match the null string, and that's all we have left, fail. */ + if (range >= 0 && startpos == total_size && fastmap + && !bufp->can_be_null) return -1; + + val = re_match_2_internal(bufp, string1, size1, string2, size2, + startpos, regs, stop); +#ifndef REGEX_MALLOC +# ifdef C_ALLOCA + alloca(0); +# endif +#endif + + if (val >= 0) + return startpos; + + if (val == -2) + return -2; + + advance: + if (!range) + break; + else if (range > 0) { + range--; + startpos++; + } else { + range++; + startpos--; + } + } + return -1; +} /* re_search_2 */ + +#ifdef _LIBC +weak_alias(__re_search_2, re_search_2) +#endif +/* This converts PTR, a pointer into one of the search strings `string1' + and `string2' into an offset from the beginning of that string. */ +#define POINTER_TO_OFFSET(ptr) \ + (FIRST_STRING_P (ptr) \ + ? ((regoff_t) ((ptr) - string1)) \ + : ((regoff_t) ((ptr) - string2 + size1))) +/* Macros for dealing with the split strings in re_match_2. */ +#define MATCHING_IN_FIRST_STRING (dend == end_match_1) +/* Call before fetching a character with *d. This switches over to + string2 if necessary. 
*/ +#define PREFETCH() \ + while (d == dend) \ + { \ + /* End of string2 => fail. */ \ + if (dend == end_match_2) \ + goto fail; \ + /* End of string1 => advance to string2. */ \ + d = string2; \ + dend = end_match_2; \ + } +/* Test if at very beginning or at very end of the virtual concatenation + of `string1' and `string2'. If only one string, it's `string2'. */ +#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) +#define AT_STRINGS_END(d) ((d) == end2) +/* Test if D points to a character which is word-constituent. We have + two special cases to check for: if past the end of string1, look at + the first character in string2; and if before the beginning of + string2, look at the last character in string1. */ +#define WORDCHAR_P(d) \ + (SYNTAX ((d) == end1 ? *string2 \ + : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ + == Sword) +/* Disabled due to a compiler bug -- see comment at case wordbound */ +#if 0 +/* Test if the character before D and the one at D differ with respect + to being word-constituent. */ +#define AT_WORD_BOUNDARY(d) \ + (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ + || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) +#endif +/* Free everything we malloc. */ +#ifdef MATCH_MAY_ALLOCATE +# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL +# define FREE_VARIABLES() \ + do { \ + REGEX_FREE_STACK (fail_stack.stack); \ + FREE_VAR (regstart); \ + FREE_VAR (regend); \ + FREE_VAR (old_regstart); \ + FREE_VAR (old_regend); \ + FREE_VAR (best_regstart); \ + FREE_VAR (best_regend); \ + FREE_VAR (reg_info); \ + FREE_VAR (reg_dummy); \ + FREE_VAR (reg_info_dummy); \ + } while (0) +#else +# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ +#endif /* not MATCH_MAY_ALLOCATE */ +/* These values must meet several constraints. They must not be valid + register values; since we have a limit of 255 registers (because + we use only one byte in the pattern for the register number), we can + use numbers larger than 255. 
They must differ by 1, because of + NUM_FAILURE_ITEMS above. And the value for the lowest register must + be larger than the value for the highest register, so we do not try + to actually save any registers when none are active. */ +#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) +#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) +/* Matching routines. */ +#ifndef emacs /* Emacs never uses this. */ +/* re_match is like re_match_2 except it takes only a single string: + STRING is passed as the second string with a null, zero-length first + string, and SIZE is used as the stop index. */ +int re_match(bufp, string, size, pos, regs) +struct re_pattern_buffer *bufp; +const char *string; +int size, pos; +struct re_registers *regs; +{ + int result = re_match_2_internal(bufp, NULL, 0, string, size, + pos, regs, size); + +# ifndef REGEX_MALLOC +# ifdef C_ALLOCA + alloca(0); +# endif +# endif + return result; +} + +# ifdef _LIBC +weak_alias(__re_match, re_match) +# endif +#endif /* not emacs */ +static boolean group_match_null_string_p _RE_ARGS((unsigned char **p, + unsigned char *end, + register_info_type * + + reg_info)); +static boolean alt_match_null_string_p +_RE_ARGS( + + (unsigned char *p, unsigned char *end, + register_info_type * reg_info)); +static boolean common_op_match_null_string_p +_RE_ARGS( + + (unsigned char **p, unsigned char *end, + register_info_type * reg_info)); +static int bcmp_translate +_RE_ARGS((const char *s1, const char *s2, int len, char *translate)); + +/* re_match_2 matches the compiled pattern in BUFP against the + (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 + and SIZE2, respectively). We start matching at POS, and stop + matching at STOP. + + If REGS is non-null and the `no_sub' field of BUFP is zero, we + store offsets for the substring each group matched in REGS. See the + documentation for exactly how many groups we fill. + + We return -1 if no match, -2 if an internal error (such as the + failure stack overflowing). Otherwise, we return the length of the + matched substring. 
*/ + +int re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) +struct re_pattern_buffer *bufp; +const char *string1, *string2; +int size1, size2; +int pos; +struct re_registers *regs; +int stop; +{ + int result = re_match_2_internal(bufp, string1, size1, string2, size2, + pos, regs, stop); + +#ifndef REGEX_MALLOC +# ifdef C_ALLOCA + alloca(0); /* Force the alloca cleanup now that re_match_2_internal has returned. */ +# endif /* C_ALLOCA */ +#endif /* not REGEX_MALLOC */ + return result; +} + +#ifdef _LIBC +weak_alias(__re_match_2, re_match_2) +#endif +/* This is a separate function so that we can force an alloca cleanup + afterwards. */ +static int +re_match_2_internal(bufp, string1, size1, string2, size2, pos, regs, stop) +struct re_pattern_buffer *bufp; +const char *string1, *string2; +int size1, size2; +int pos; +struct re_registers *regs; +int stop; +{ + /* General temporaries. */ + int mcnt; + unsigned char *p1; + + /* Just past the end of the corresponding string. */ + const char *end1, *end2; + + /* Pointers into string1 and string2, just past the last characters in + each to consider matching. */ + const char *end_match_1, *end_match_2; + + /* Where we are in the data, and the end of the current string. */ + const char *d, *dend; + + /* Where we are in the pattern, and the end of the pattern. */ + unsigned char *p = bufp->buffer; + register unsigned char *pend = p + bufp->used; + + /* Mark the opcode just after a start_memory, so we can test for an + empty subpattern when we get to the stop_memory. */ + unsigned char *just_past_start_mem = 0; + + /* We use this to map every character in the string. */ + RE_TRANSLATE_TYPE translate = bufp->translate; + + /* Failure point stack. Each place that can handle a failure further + down the line pushes a failure point on this stack. It consists of + restart, regend, and reg_info for all registers corresponding to + the subexpressions we're currently inside, plus the number of such + registers, and, finally, two char *'s. 
The first char * is where + to resume scanning the pattern; the second one is where to resume + scanning the strings. If the latter is zero, the failure point is + a ``dummy''; if a failure happens and the failure point is a dummy, + it gets discarded and the next next one is tried. */ +#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ + fail_stack_type fail_stack; +#endif +#ifdef DEBUG + static unsigned failure_id; + unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; +#endif + +#ifdef REL_ALLOC + /* This holds the pointer to the failure stack, when + it is allocated relocatably. */ + fail_stack_elt_t *failure_stack_ptr; +#endif + + /* We fill all the registers internally, independent of what we + return, for use in backreferences. The number here includes + an element for register zero. */ + size_t num_regs = bufp->re_nsub + 1; + + /* The currently active registers. */ + active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG; + active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG; + + /* Information on the contents of registers. These are pointers into + the input strings; they record just what was matched (on this + attempt) by a subexpression part of the pattern, that is, the + regnum-th regstart pointer points to where in the pattern we began + matching and the regnum-th regend points to right after where we + stopped matching the regnum-th subexpression. (The zeroth register + keeps track of what the whole pattern matches.) */ +#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ + const char **regstart, **regend; +#endif + + /* If a group that's operated upon by a repetition operator fails to + match anything, then the register for its start will need to be + restored because it will have been set to wherever in the string we + are when we last see its open-group operator. Similarly for a + register's end. */ +#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. 
*/ + const char **old_regstart, **old_regend; +#endif + + /* The is_active field of reg_info helps us keep track of which (possibly + nested) subexpressions we are currently in. The matched_something + field of reg_info[reg_num] helps us tell whether or not we have + matched any of the pattern so far this time through the reg_num-th + subexpression. These two fields get reset each time through any + loop their register is in. */ +#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ + register_info_type *reg_info; +#endif + + /* The following record the register info as found in the above + variables when we find a match better than any we've seen before. + This happens as we backtrack through the failure points, which in + turn happens only if we have not yet matched the entire string. */ + unsigned best_regs_set = false; + +#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ + const char **best_regstart, **best_regend; +#endif + + /* Logically, this is `best_regend[0]'. But we don't want to have to + allocate space for that if we're not allocating space for anything + else (see below). Also, we never need info about register 0 for + any of the other register vectors, and it seems rather a kludge to + treat `best_regend' differently than the rest. So we keep track of + the end of the best match so far in a separate variable. We + initialize this to NULL so that when we backtrack the first time + and need to test it, it's not garbage. */ + const char *match_end = NULL; + + /* This helps SET_REGS_MATCHED avoid doing redundant work. */ + int set_regs_matched_done = 0; + + /* Used when we pop values we don't care about. */ +#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ + const char **reg_dummy; + register_info_type *reg_info_dummy; +#endif + +#ifdef DEBUG + /* Counts the total number of registers pushed. 
*/ + unsigned num_regs_pushed = 0; +#endif + + DEBUG_PRINT1("\n\nEntering re_match_2.\n"); + + INIT_FAIL_STACK(); + +#ifdef MATCH_MAY_ALLOCATE + /* Do not bother to initialize all the register variables if there are + no groups in the pattern, as it takes a fair amount of time. If + there are groups, we include space for register 0 (the whole + pattern), even though we never use it, since it simplifies the + array indexing. We should fix this. */ + if (bufp->re_nsub) { + regstart = REGEX_TALLOC(num_regs, const char *); + regend = REGEX_TALLOC(num_regs, const char *); + old_regstart = REGEX_TALLOC(num_regs, const char *); + old_regend = REGEX_TALLOC(num_regs, const char *); + best_regstart = REGEX_TALLOC(num_regs, const char *); + best_regend = REGEX_TALLOC(num_regs, const char *); + + reg_info = REGEX_TALLOC(num_regs, register_info_type); + reg_dummy = REGEX_TALLOC(num_regs, const char *); + + reg_info_dummy = REGEX_TALLOC(num_regs, register_info_type); + + if (!(regstart && regend && old_regstart && old_regend && reg_info + && best_regstart && best_regend && reg_dummy + && reg_info_dummy)) { + FREE_VARIABLES(); + return -2; + } + } else { + /* We must initialize all our variables to NULL, so that + `FREE_VARIABLES' doesn't try to free them. */ + regstart = regend = old_regstart = old_regend = best_regstart + = best_regend = reg_dummy = NULL; + reg_info = reg_info_dummy = (register_info_type *) NULL; + } +#endif /* MATCH_MAY_ALLOCATE */ + + /* The starting position is bogus. */ + if (pos < 0 || pos > size1 + size2) { + FREE_VARIABLES(); + return -1; + } + + /* Initialize subexpression text positions to -1 to mark ones that no + start_memory/stop_memory has been seen for. Also initialize the + register information struct. 
*/ + for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) { + regstart[mcnt] = regend[mcnt] + = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; + + REG_MATCH_NULL_STRING_P(reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; + IS_ACTIVE(reg_info[mcnt]) = 0; + MATCHED_SOMETHING(reg_info[mcnt]) = 0; + EVER_MATCHED_SOMETHING(reg_info[mcnt]) = 0; + } + + /* We move `string1' into `string2' if the latter's empty -- but not if + `string1' is null. */ + if (size2 == 0 && string1 != NULL) { + string2 = string1; + size2 = size1; + string1 = 0; + size1 = 0; + } + end1 = string1 + size1; + end2 = string2 + size2; + + /* Compute where to stop matching, within the two strings. */ + if (stop <= size1) { + end_match_1 = string1 + stop; + end_match_2 = string2; + } else { + end_match_1 = end1; + end_match_2 = string2 + stop - size1; + } + + /* `p' scans through the pattern as `d' scans through the data. + `dend' is the end of the input string that `d' points within. `d' + is advanced into the following input string whenever necessary, but + this happens before fetching; therefore, at the beginning of the + loop, `d' can be pointing at the end of a string, but it cannot + equal `string2'. */ + if (size1 > 0 && pos <= size1) { + d = string1 + pos; + dend = end_match_1; + } else { + d = string2 + pos - size1; + dend = end_match_2; + } + + DEBUG_PRINT1("The compiled pattern is:\n"); + DEBUG_PRINT_COMPILED_PATTERN(bufp, p, pend); + DEBUG_PRINT1("The string to match is: `"); + DEBUG_PRINT_DOUBLE_STRING(d, string1, size1, string2, size2); + DEBUG_PRINT1("'\n"); + + /* This loops over pattern commands. It exits by returning from the + function if the match is complete, or it drops through if the match + fails at this starting point in the input data. */ + for (;;) { +#ifdef _LIBC + DEBUG_PRINT2("\n%p: ", p); +#else + DEBUG_PRINT2("\n0x%x: ", p); +#endif + + if (p == pend) { /* End of pattern means we might have succeeded. */ + DEBUG_PRINT1("end of pattern ... 
"); + + /* If we haven't matched the entire string, and we want the + longest match, try backtracking. */ + if (d != end_match_2) { + /* 1 if this match ends in the same string (string1 or string2) + as the best previous match. */ + boolean same_str_p = (FIRST_STRING_P(match_end) + == MATCHING_IN_FIRST_STRING); + + /* 1 if this match is the best seen so far. */ + boolean best_match_p; + + /* AIX compiler got confused when this was combined + with the previous declaration. */ + if (same_str_p) + best_match_p = d > match_end; + else + best_match_p = !MATCHING_IN_FIRST_STRING; + + DEBUG_PRINT1("backtracking.\n"); + + if (!FAIL_STACK_EMPTY()) { /* More failure points to try. */ + + /* If exceeds best match so far, save it. */ + if (!best_regs_set || best_match_p) { + best_regs_set = true; + match_end = d; + + DEBUG_PRINT1("\nSAVING match as best so far.\n"); + + for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) { + best_regstart[mcnt] = regstart[mcnt]; + best_regend[mcnt] = regend[mcnt]; + } + } + goto fail; + } + + /* If no failure points, don't restore garbage. And if + last match is real best match, don't restore second + best one. */ + else if (best_regs_set && !best_match_p) { + restore_best_regs: + /* Restore best match. It may happen that `dend == + end_match_1' while the restored d is in string2. + For example, the pattern `x.*y.*z' against the + strings `x-' and `y-z-', if the two strings are + not consecutive in memory. */ + DEBUG_PRINT1("Restoring best registers.\n"); + + d = match_end; + dend = ((d >= string1 && d <= end1) + ? end_match_1 : end_match_2); + + for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) { + regstart[mcnt] = best_regstart[mcnt]; + regend[mcnt] = best_regend[mcnt]; + } + } + } + /* d != end_match_2 */ + succeed_label: + DEBUG_PRINT1("Accepting match.\n"); + + /* If caller wants register contents data back, do it. */ + if (regs && !bufp->no_sub) { + /* Have the register data arrays been allocated? 
*/ + if (bufp->regs_allocated == REGS_UNALLOCATED) { /* No. So allocate them with malloc. We need one + extra element beyond `num_regs' for the `-1' marker + GNU code uses. */ + regs->num_regs = MAX(RE_NREGS, num_regs + 1); + regs->start = TALLOC(regs->num_regs, regoff_t); + regs->end = TALLOC(regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) { + FREE_VARIABLES(); + return -2; + } + bufp->regs_allocated = REGS_REALLOCATE; + } else if (bufp->regs_allocated == REGS_REALLOCATE) { /* Yes. If we need more elements than were already + allocated, reallocate them. If we need fewer, just + leave it alone. */ + if (regs->num_regs < num_regs + 1) { + regs->num_regs = num_regs + 1; + RETALLOC(regs->start, regs->num_regs, regoff_t); + RETALLOC(regs->end, regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) { + FREE_VARIABLES(); + return -2; + } + } + } else { + /* These braces fend off a "empty body in an else-statement" + warning under GCC when assert expands to nothing. */ + assert(bufp->regs_allocated == REGS_FIXED); + } + + /* Convert the pointer data in `regstart' and `regend' to + indices. Register zero has to be set differently, + since we haven't kept track of any info for it. */ + if (regs->num_regs > 0) { + regs->start[0] = pos; + regs->end[0] = (MATCHING_IN_FIRST_STRING + ? ((regoff_t) (d - string1)) + : ((regoff_t) (d - string2 + size1))); + } + + /* Go through the first `min (num_regs, regs->num_regs)' + registers, since that is all we initialized. */ + for (mcnt = 1; + (unsigned) mcnt < MIN(num_regs, regs->num_regs); + mcnt++) { + if (REG_UNSET(regstart[mcnt]) + || REG_UNSET(regend[mcnt])) regs->start[mcnt] = + regs->end[mcnt] = -1; + else { + regs->start[mcnt] + = (regoff_t) POINTER_TO_OFFSET(regstart[mcnt]); + regs->end[mcnt] + = (regoff_t) POINTER_TO_OFFSET(regend[mcnt]); + } + } + + /* If the regs structure we return has more elements than + were in the pattern, set the extra elements to -1. 
If + we (re)allocated the registers, this is the case, + because we always allocate enough to have at least one + -1 at the end. */ + for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; + mcnt++) + regs->start[mcnt] = regs->end[mcnt] = -1; + } + /* regs && !bufp->no_sub */ + DEBUG_PRINT4 + ("%u failure points pushed, %u popped (%u remain).\n", + nfailure_points_pushed, nfailure_points_popped, + nfailure_points_pushed - nfailure_points_popped); + DEBUG_PRINT2("%u registers pushed.\n", num_regs_pushed); + + mcnt = d - pos - (MATCHING_IN_FIRST_STRING + ? string1 : string2 - size1); + + DEBUG_PRINT2("Returning %d from re_match_2.\n", mcnt); + + FREE_VARIABLES(); + return mcnt; + } + + /* Otherwise match next pattern command. */ + switch (SWITCH_ENUM_CAST((re_opcode_t) * p++)) { + /* Ignore these. Used to ignore the n of succeed_n's which + currently have n == 0. */ + case no_op: + DEBUG_PRINT1("EXECUTING no_op.\n"); + break; + + case succeed: + DEBUG_PRINT1("EXECUTING succeed.\n"); + goto succeed_label; + + /* Match the next n pattern characters exactly. The following + byte in the pattern defines n, and the n bytes after that + are the characters to match. */ + case exactn: + mcnt = *p++; + DEBUG_PRINT2("EXECUTING exactn %d.\n", mcnt); + + /* This is written out as an if-else so we don't waste time + testing `translate' inside the loop. */ + if (translate) { + do { + PREFETCH(); + if ((unsigned char) translate[(unsigned char) *d++] + != (unsigned char) *p++) + goto fail; + } + while (--mcnt); + } else { + do { + PREFETCH(); + if (*d++ != (char) *p++) + goto fail; + } + while (--mcnt); + } + SET_REGS_MATCHED(); + break; + + + /* Match any character except possibly a newline or a null. 
*/ + case anychar: + DEBUG_PRINT1("EXECUTING anychar.\n"); + + PREFETCH(); + + if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE(*d) == '\n') + || (bufp->syntax & RE_DOT_NOT_NULL + && TRANSLATE(*d) == '\000')) goto fail; + + SET_REGS_MATCHED(); + DEBUG_PRINT2(" Matched `%d'.\n", *d); + d++; + break; + + + case charset: + case charset_not: + { + register unsigned char c; + boolean not = (re_opcode_t) * (p - 1) == charset_not; + + DEBUG_PRINT2("EXECUTING charset%s.\n", not ? "_not" : ""); + + PREFETCH(); + c = TRANSLATE(*d); /* The character to match. */ + + /* Cast to `unsigned' instead of `unsigned char' in case the + bit list is a full 32 bytes long. */ + if (c < (unsigned) (*p * BYTEWIDTH) + && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + p += 1 + *p; + + if (!not) + goto fail; + + SET_REGS_MATCHED(); + d++; + break; + } + + + /* The beginning of a group is represented by start_memory. + The arguments are the register number in the next byte, and the + number of groups inner to this one in the next. The text + matched within the group is recorded (in the internal + registers data structure) under the register number. */ + case start_memory: + DEBUG_PRINT3("EXECUTING start_memory %d (%d):\n", *p, p[1]); + + /* Find out if this group can match the empty string. */ + p1 = p; /* To send to group_match_null_string_p. */ + + if (REG_MATCH_NULL_STRING_P(reg_info[*p]) == + MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P(reg_info[*p]) = + group_match_null_string_p(&p1, pend, reg_info); + + /* Save the position in the string where we were the last time + we were at this open-group operator in case the group is + operated upon by a repetition operator, e.g., with `(a*)*b' + against `ab'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p]) + ? REG_UNSET(regstart[*p]) ? 
d : regstart[*p] + : regstart[*p]; + DEBUG_PRINT2(" old_regstart: %d\n", + POINTER_TO_OFFSET(old_regstart[*p])); + + regstart[*p] = d; + DEBUG_PRINT2(" regstart: %d\n", + POINTER_TO_OFFSET(regstart[*p])); + + IS_ACTIVE(reg_info[*p]) = 1; + MATCHED_SOMETHING(reg_info[*p]) = 0; + + /* Clear this whenever we change the register activity status. */ + set_regs_matched_done = 0; + + /* This is the new highest active register. */ + highest_active_reg = *p; + + /* If nothing was active before, this is the new lowest active + register. */ + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *p; + + /* Move past the register number and inner group count. */ + p += 2; + just_past_start_mem = p; + + break; + + + /* The stop_memory opcode represents the end of a group. Its + arguments are the same as start_memory's: the register + number, and the number of inner groups. */ + case stop_memory: + DEBUG_PRINT3("EXECUTING stop_memory %d (%d):\n", *p, p[1]); + + /* We need to save the string position the last time we were at + this close-group operator in case the group is operated + upon by a repetition operator, e.g., with `((a*)*(b*)*)*' + against `aba'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regend[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p]) + ? REG_UNSET(regend[*p]) ? d : regend[*p] + : regend[*p]; + DEBUG_PRINT2(" old_regend: %d\n", + POINTER_TO_OFFSET(old_regend[*p])); + + regend[*p] = d; + DEBUG_PRINT2(" regend: %d\n", + POINTER_TO_OFFSET(regend[*p])); + + /* This register isn't active anymore. */ + IS_ACTIVE(reg_info[*p]) = 0; + + /* Clear this whenever we change the register activity status. */ + set_regs_matched_done = 0; + + /* If this was the only register active, nothing is active + anymore. 
*/ + if (lowest_active_reg == highest_active_reg) { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } else { /* We must scan for the new highest active register, since + it isn't necessarily one less than now: consider + (a(b)c(d(e)f)g). When group 3 ends, after the f), the + new highest active register is 1. */ + unsigned char r = *p - 1; + + while (r > 0 && !IS_ACTIVE(reg_info[r])) + r--; + + /* If we end up at register zero, that means that we saved + the registers as the result of an `on_failure_jump', not + a `start_memory', and we jumped to past the innermost + `stop_memory'. For example, in ((.)*) we save + registers 1 and 2 as a result of the *, but when we pop + back to the second ), we are at the stop_memory 1. + Thus, nothing is active. */ + if (r == 0) { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } else + highest_active_reg = r; + } + + /* If just failed to match something this time around with a + group that's operated on by a repetition operator, try to + force exit from the ``loop'', and restore the register + information for this group that we had before trying this + last match. */ + if ((!MATCHED_SOMETHING(reg_info[*p]) + || just_past_start_mem == p - 1) + && (p + 2) < pend) { + boolean is_a_jump_n = false; + + p1 = p + 2; + mcnt = 0; + switch ((re_opcode_t) * p1++) { + case jump_n: + is_a_jump_n = true; + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + if (is_a_jump_n) + p1 += 2; + break; + + default: + /* do nothing */ ; + } + p1 += mcnt; + + /* If the next operation is a jump backwards in the pattern + to an on_failure_jump right before the start_memory + corresponding to this stop_memory, exit from the loop + by forcing a failure after pushing on the stack the + on_failure_jump's jump in the pattern, and d. 
*/ + if (mcnt < 0 && (re_opcode_t) * p1 == on_failure_jump + && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) { + /* If this group ever matched anything, then restore + what its registers were before trying this last + failed match, e.g., with `(a*)*b' against `ab' for + regstart[1], and, e.g., with `((a*)*(b*)*)*' + against `aba' for regend[3]. + + Also restore the registers for inner groups for, + e.g., `((a*)(b*))*' against `aba' (register 3 would + otherwise get trashed). */ + + if (EVER_MATCHED_SOMETHING(reg_info[*p])) { + unsigned r; + + EVER_MATCHED_SOMETHING(reg_info[*p]) = 0; + + /* Restore this and inner groups' (if any) registers. */ + for (r = *p; + r < (unsigned) *p + (unsigned) *(p + 1); r++) { + regstart[r] = old_regstart[r]; + + /* xx why this test? */ + if (old_regend[r] >= regstart[r]) + regend[r] = old_regend[r]; + } + } + p1++; + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + PUSH_FAILURE_POINT(p1 + mcnt, d, -2); + + goto fail; + } + } + + /* Move past the register number and the inner group count. */ + p += 2; + break; + + + /* \<digit> has been turned into a `duplicate' command which is + followed by the numeric value of <digit> as the register number. */ + case duplicate: + { + register const char *d2, *dend2; + int regno = *p++; /* Get which register to match against. */ + + DEBUG_PRINT2("EXECUTING duplicate %d.\n", regno); + + /* Can't back reference a group which we've never matched. */ + if (REG_UNSET(regstart[regno]) || REG_UNSET(regend[regno])) + goto fail; + + /* Where in input to try to start matching. */ + d2 = regstart[regno]; + + /* Where to stop matching; if both the place to start and + the place to stop matching are in the same string, then + set to the place to stop, otherwise, for now have to use + the end of the first string. */ + + dend2 = ((FIRST_STRING_P(regstart[regno]) + == FIRST_STRING_P(regend[regno])) + ? regend[regno] : end_match_1); + for (;;) { + /* If necessary, advance to next segment in register + contents. 
*/ + while (d2 == dend2) { + if (dend2 == end_match_2) + break; + if (dend2 == regend[regno]) + break; + + /* End of string1 => advance to string2. */ + d2 = string2; + dend2 = regend[regno]; + } + /* At end of register contents => success */ + if (d2 == dend2) + break; + + /* If necessary, advance to next segment in data. */ + PREFETCH(); + + /* How many characters left in this segment to match. */ + mcnt = dend - d; + + /* Want how many consecutive characters we can match in + one shot, so, if necessary, adjust the count. */ + if (mcnt > dend2 - d2) + mcnt = dend2 - d2; + + /* Compare that many; failure if mismatch, else move + past them. */ + if (translate ? bcmp_translate(d, d2, mcnt, translate) + : memcmp(d, d2, mcnt)) + goto fail; + d += mcnt, d2 += mcnt; + + /* Do this because we've match some characters. */ + SET_REGS_MATCHED(); + } + } + break; + + + /* begline matches the empty string at the beginning of the string + (unless `not_bol' is set in `bufp'), and, if + `newline_anchor' is set, after newlines. */ + case begline: + DEBUG_PRINT1("EXECUTING begline.\n"); + + if (AT_STRINGS_BEG(d)) { + if (!bufp->not_bol) + break; + } else if (d[-1] == '\n' && bufp->newline_anchor) { + break; + } + /* In all other cases, we fail. */ + goto fail; + + + /* endline is the dual of begline. */ + case endline: + DEBUG_PRINT1("EXECUTING endline.\n"); + + if (AT_STRINGS_END(d)) { + if (!bufp->not_eol) + break; + } + + /* We have to ``prefetch'' the next character. */ + else if ((d == end1 ? *string2 : *d) == '\n' + && bufp->newline_anchor) { + break; + } + goto fail; + + + /* Match at the very beginning of the data. */ + case begbuf: + DEBUG_PRINT1("EXECUTING begbuf.\n"); + if (AT_STRINGS_BEG(d)) + break; + goto fail; + + + /* Match at the very end of the data. */ + case endbuf: + DEBUG_PRINT1("EXECUTING endbuf.\n"); + if (AT_STRINGS_END(d)) + break; + goto fail; + + + /* on_failure_keep_string_jump is used to optimize `.*\n'. 
It + pushes NULL as the value for the string on the stack. Then + `pop_failure_point' will keep the current value for the + string, instead of restoring it. To see why, consider + matching `foo\nbar' against `.*\n'. The .* matches the foo; + then the . fails against the \n. But the next thing we want + to do is match the \n against the \n; if we restored the + string value, we would be back at the foo. + + Because this is used only in specific cases, we don't need to + check all the things that `on_failure_jump' does, to make + sure the right things get saved on the stack. Hence we don't + share its code. The only reason to push anything on the + stack at all is that otherwise we would have to change + `anychar's code to do something besides goto fail in this + case; that seems worse than this. */ + case on_failure_keep_string_jump: + DEBUG_PRINT1("EXECUTING on_failure_keep_string_jump"); + + EXTRACT_NUMBER_AND_INCR(mcnt, p); +#ifdef _LIBC + DEBUG_PRINT3(" %d (to %p):\n", mcnt, p + mcnt); +#else + DEBUG_PRINT3(" %d (to 0x%x):\n", mcnt, p + mcnt); +#endif + + PUSH_FAILURE_POINT(p + mcnt, NULL, -2); + break; + + + /* Uses of on_failure_jump: + + Each alternative starts with an on_failure_jump that points + to the beginning of the next alternative. Each alternative + except the last ends with a jump that in effect jumps past + the rest of the alternatives. (They really jump to the + ending jump of the following alternative, because tensioning + these jumps is a hassle.) + + Repeats start with an on_failure_jump that points past both + the repetition text and either the following jump or + pop_failure_jump back to this on_failure_jump. 
*/ + case on_failure_jump: + on_failure: + DEBUG_PRINT1("EXECUTING on_failure_jump"); + + EXTRACT_NUMBER_AND_INCR(mcnt, p); +#ifdef _LIBC + DEBUG_PRINT3(" %d (to %p)", mcnt, p + mcnt); +#else + DEBUG_PRINT3(" %d (to 0x%x)", mcnt, p + mcnt); +#endif + + /* If this on_failure_jump comes right before a group (i.e., + the original * applied to a group), save the information + for that group and all inner ones, so that if we fail back + to this point, the group's information will be correct. + For example, in \(a*\)*\1, we need the preceding group, + and in \(zz\(a*\)b*\)\2, we need the inner group. */ + + /* We can't use `p' to check ahead because we push + a failure point to `p + mcnt' after we do this. */ + p1 = p; + + /* We need to skip no_op's before we look for the + start_memory in case this on_failure_jump is happening as + the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 + against aba. */ + while (p1 < pend && (re_opcode_t) * p1 == no_op) + p1++; + + if (p1 < pend && (re_opcode_t) * p1 == start_memory) { + /* We have a new highest active register now. This will + get reset at the start_memory we are about to get to, + but we will have saved all the registers relevant to + this repetition op, as described above. */ + highest_active_reg = *(p1 + 1) + *(p1 + 2); + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *(p1 + 1); + } + + DEBUG_PRINT1(":\n"); + PUSH_FAILURE_POINT(p + mcnt, d, -2); + break; + + + /* A smart repeat ends with `maybe_pop_jump'. + We change it to either `pop_failure_jump' or `jump'. */ + case maybe_pop_jump: + EXTRACT_NUMBER_AND_INCR(mcnt, p); + DEBUG_PRINT2("EXECUTING maybe_pop_jump %d.\n", mcnt); + { + register unsigned char *p2 = p; + + /* Compare the beginning of the repeat with what in the + pattern follows its end. 
If we can establish that there + is nothing that they would both match, i.e., that we + would have to backtrack because of (as in, e.g., `a*a') + then we can change to pop_failure_jump, because we'll + never have to backtrack. + + This is not true in the case of alternatives: in + `(a|ab)*' we do need to backtrack to the `ab' alternative + (e.g., if the string was `ab'). But instead of trying to + detect that here, the alternative has put on a dummy + failure point which is what we will end up popping. */ + + /* Skip over open/close-group commands. + If what follows this loop is a ...+ construct, + look at what begins its body, since we will have to + match at least one of that. */ + while (1) { + if (p2 + 2 < pend + && ((re_opcode_t) * p2 == stop_memory + || (re_opcode_t) * p2 == start_memory)) + p2 += 3; + else if (p2 + 6 < pend + && (re_opcode_t) * p2 == dummy_failure_jump) + p2 += 6; + else + break; + } + + p1 = p + mcnt; + /* p1[0] ... p1[2] are the `on_failure_jump' corresponding + to the `maybe_finalize_jump' of this case. Examine what + follows. */ + + /* If we're at the end of the pattern, we can change. */ + if (p2 == pend) { + /* Consider what happens when matching ":\(.*\)" + against ":/". I don't really understand this code + yet. */ + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" End of pattern: change to `pop_failure_jump'.\n"); + } + + else if ((re_opcode_t) * p2 == exactn + || (bufp->newline_anchor + && (re_opcode_t) * p2 == endline)) { + register unsigned char c = + *p2 == (unsigned char) endline ? 
'\n' : p2[2]; + + if ((re_opcode_t) p1[3] == exactn && p1[5] != c) { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT3(" %c != %c => pop_failure_jump.\n", + c, p1[5]); + } + + else if ((re_opcode_t) p1[3] == charset + || (re_opcode_t) p1[3] == charset_not) { + int not = (re_opcode_t) p1[3] == charset_not; + + if (c < (unsigned char) (p1[4] * BYTEWIDTH) + && p1[5 + + c / BYTEWIDTH] & (1 << (c % + BYTEWIDTH))) not + = !not; + + /* `not' is equal to 1 if c would match, which means + that we can't change to pop_failure_jump. */ + if (!not) { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" No match => pop_failure_jump.\n"); + } + } + } else if ((re_opcode_t) * p2 == charset) { + /* We win if the first character of the loop is not part + of the charset. */ + if ((re_opcode_t) p1[3] == exactn + && !((int) p2[1] * BYTEWIDTH > (int) p1[5] + && (p2[2 + p1[5] / BYTEWIDTH] + & (1 << (p1[5] % BYTEWIDTH))))) { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1(" No match => pop_failure_jump.\n"); + } + + else if ((re_opcode_t) p1[3] == charset_not) { + int idx; + + /* We win if the charset_not inside the loop + lists every character listed in the charset after. */ + for (idx = 0; idx < (int) p2[1]; idx++) + if (!(p2[2 + idx] == 0 || (idx < (int) p1[4] + && + ((p2 + [2 + + idx] & ~p1[5 + + idx]) + == 0)))) + break; + + if (idx == p2[1]) { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" No match => pop_failure_jump.\n"); + } + } else if ((re_opcode_t) p1[3] == charset) { + int idx; + + /* We win if the charset inside the loop + has no overlap with the one after the loop. */ + for (idx = 0; + idx < (int) p2[1] && idx < (int) p1[4]; idx++) + if ((p2[2 + idx] & p1[5 + idx]) != 0) + break; + + if (idx == p2[1] || idx == p1[4]) { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" No match => pop_failure_jump.\n"); + } + } + } + } + p -= 2; /* Point at relative address again. 
*/ + if ((re_opcode_t) p[-1] != pop_failure_jump) { + p[-1] = (unsigned char) jump; + DEBUG_PRINT1(" Match => jump.\n"); + goto unconditional_jump; + } + /* Note fall through. */ + + + /* The end of a simple repeat has a pop_failure_jump back to + its matching on_failure_jump, where the latter will push a + failure point. The pop_failure_jump takes off failure + points put on by this pop_failure_jump's matching + on_failure_jump; we got through the pattern to here from the + matching on_failure_jump, so didn't fail. */ + case pop_failure_jump: + { + /* We need to pass separate storage for the lowest and + highest registers, even though we don't care about the + actual values. Otherwise, we will restore only one + register from the stack, since lowest will == highest in + `pop_failure_point'. */ + active_reg_t dummy_low_reg, dummy_high_reg; + unsigned char *pdummy; + const char *sdummy; + + DEBUG_PRINT1("EXECUTING pop_failure_jump.\n"); + POP_FAILURE_POINT(sdummy, pdummy, + dummy_low_reg, dummy_high_reg, + reg_dummy, reg_dummy, reg_info_dummy); + } + /* Note fall through. */ + + unconditional_jump: +#ifdef _LIBC + DEBUG_PRINT2("\n%p: ", p); +#else + DEBUG_PRINT2("\n0x%x: ", p); +#endif + /* Note fall through. */ + + /* Unconditionally jump (without popping any failure points). */ + case jump: + EXTRACT_NUMBER_AND_INCR(mcnt, p); /* Get the amount to jump. */ + DEBUG_PRINT2("EXECUTING jump %d ", mcnt); + p += mcnt; /* Do the jump. */ +#ifdef _LIBC + DEBUG_PRINT2("(to %p).\n", p); +#else + DEBUG_PRINT2("(to 0x%x).\n", p); +#endif + break; + + + /* We need this opcode so we can detect where alternatives end + in `group_match_null_string_p' et al. */ + case jump_past_alt: + DEBUG_PRINT1("EXECUTING jump_past_alt.\n"); + goto unconditional_jump; + + + /* Normally, the on_failure_jump pushes a failure point, which + then gets popped at pop_failure_jump. 
We will end up at + pop_failure_jump, also, and with a pattern of, say, `a+', we + are skipping over the on_failure_jump, so we have to push + something meaningless for pop_failure_jump to pop. */ + case dummy_failure_jump: + DEBUG_PRINT1("EXECUTING dummy_failure_jump.\n"); + /* It doesn't matter what we push for the string here. What + the code at `fail' tests is the value for the pattern. */ + PUSH_FAILURE_POINT(NULL, NULL, -2); + goto unconditional_jump; + + + /* At the end of an alternative, we need to push a dummy failure + point in case we are followed by a `pop_failure_jump', because + we don't want the failure point for the alternative to be + popped. For example, matching `(a|ab)*' against `aab' + requires that we match the `ab' alternative. */ + case push_dummy_failure: + DEBUG_PRINT1("EXECUTING push_dummy_failure.\n"); + /* See comments just above at `dummy_failure_jump' about the + two zeroes. */ + PUSH_FAILURE_POINT(NULL, NULL, -2); + break; + + /* Have to succeed matching what follows at least n times. + After that, handle like `on_failure_jump'. */ + case succeed_n: + EXTRACT_NUMBER(mcnt, p + 2); + DEBUG_PRINT2("EXECUTING succeed_n %d.\n", mcnt); + + assert(mcnt >= 0); + /* Originally, this is how many times we HAVE to succeed. */ + if (mcnt > 0) { + mcnt--; + p += 2; + STORE_NUMBER_AND_INCR(p, mcnt); +#ifdef _LIBC + DEBUG_PRINT3(" Setting %p to %d.\n", p - 2, mcnt); +#else + DEBUG_PRINT3(" Setting 0x%x to %d.\n", p - 2, mcnt); +#endif + } else if (mcnt == 0) { +#ifdef _LIBC + DEBUG_PRINT2(" Setting two bytes from %p to no_op.\n", + p + 2); +#else + DEBUG_PRINT2(" Setting two bytes from 0x%x to no_op.\n", + p + 2); +#endif + p[2] = (unsigned char) no_op; + p[3] = (unsigned char) no_op; + goto on_failure; + } + break; + + case jump_n: + EXTRACT_NUMBER(mcnt, p + 2); + DEBUG_PRINT2("EXECUTING jump_n %d.\n", mcnt); + + /* Originally, this is how many times we CAN jump. 
*/ + if (mcnt) { + mcnt--; + STORE_NUMBER(p + 2, mcnt); +#ifdef _LIBC + DEBUG_PRINT3(" Setting %p to %d.\n", p + 2, mcnt); +#else + DEBUG_PRINT3(" Setting 0x%x to %d.\n", p + 2, mcnt); +#endif + goto unconditional_jump; + } + /* If don't have to jump any more, skip over the rest of command. */ + else + p += 4; + break; + + case set_number_at: + { + DEBUG_PRINT1("EXECUTING set_number_at.\n"); + + EXTRACT_NUMBER_AND_INCR(mcnt, p); + p1 = p + mcnt; + EXTRACT_NUMBER_AND_INCR(mcnt, p); +#ifdef _LIBC + DEBUG_PRINT3(" Setting %p to %d.\n", p1, mcnt); +#else + DEBUG_PRINT3(" Setting 0x%x to %d.\n", p1, mcnt); +#endif + STORE_NUMBER(p1, mcnt); + break; + } + +#if 0 + /* The DEC Alpha C compiler 3.x generates incorrect code for the + test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of + AT_WORD_BOUNDARY, so this code is disabled. Expanding the + macro and introducing temporary variables works around the bug. */ + + case wordbound: + DEBUG_PRINT1("EXECUTING wordbound.\n"); + if (AT_WORD_BOUNDARY(d)) + break; + goto fail; + + case notwordbound: + DEBUG_PRINT1("EXECUTING notwordbound.\n"); + if (AT_WORD_BOUNDARY(d)) + goto fail; + break; +#else + case wordbound: + { + boolean prevchar, thischar; + + DEBUG_PRINT1("EXECUTING wordbound.\n"); + if (AT_STRINGS_BEG(d) || AT_STRINGS_END(d)) + break; + + prevchar = WORDCHAR_P(d - 1); + thischar = WORDCHAR_P(d); + if (prevchar != thischar) + break; + goto fail; + } + + case notwordbound: + { + boolean prevchar, thischar; + + DEBUG_PRINT1("EXECUTING notwordbound.\n"); + if (AT_STRINGS_BEG(d) || AT_STRINGS_END(d)) + goto fail; + + prevchar = WORDCHAR_P(d - 1); + thischar = WORDCHAR_P(d); + if (prevchar != thischar) + goto fail; + break; + } +#endif + + case wordbeg: + DEBUG_PRINT1("EXECUTING wordbeg.\n"); + if (WORDCHAR_P(d) && (AT_STRINGS_BEG(d) || !WORDCHAR_P(d - 1))) + break; + goto fail; + + case wordend: + DEBUG_PRINT1("EXECUTING wordend.\n"); + if (!AT_STRINGS_BEG(d) && WORDCHAR_P(d - 1) + && (!WORDCHAR_P(d) || 
AT_STRINGS_END(d))) + break; + goto fail; + +#ifdef emacs + case before_dot: + DEBUG_PRINT1("EXECUTING before_dot.\n"); + if (PTR_CHAR_POS((unsigned char *) d) >= point) + goto fail; + break; + + case at_dot: + DEBUG_PRINT1("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS((unsigned char *) d) != point) + goto fail; + break; + + case after_dot: + DEBUG_PRINT1("EXECUTING after_dot.\n"); + if (PTR_CHAR_POS((unsigned char *) d) <= point) + goto fail; + break; + + case syntaxspec: + DEBUG_PRINT2("EXECUTING syntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchsyntax; + + case wordchar: + DEBUG_PRINT1("EXECUTING Emacs wordchar.\n"); + mcnt = (int) Sword; + matchsyntax: + PREFETCH(); + /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ + d++; + if (SYNTAX(d[-1]) != (enum syntaxcode) mcnt) + goto fail; + SET_REGS_MATCHED(); + break; + + case notsyntaxspec: + DEBUG_PRINT2("EXECUTING notsyntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchnotsyntax; + + case notwordchar: + DEBUG_PRINT1("EXECUTING Emacs notwordchar.\n"); + mcnt = (int) Sword; + matchnotsyntax: + PREFETCH(); + /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ + d++; + if (SYNTAX(d[-1]) == (enum syntaxcode) mcnt) + goto fail; + SET_REGS_MATCHED(); + break; + +#else /* not emacs */ + case wordchar: + DEBUG_PRINT1("EXECUTING non-Emacs wordchar.\n"); + PREFETCH(); + if (!WORDCHAR_P(d)) + goto fail; + SET_REGS_MATCHED(); + d++; + break; + + case notwordchar: + DEBUG_PRINT1("EXECUTING non-Emacs notwordchar.\n"); + PREFETCH(); + if (WORDCHAR_P(d)) + goto fail; + SET_REGS_MATCHED(); + d++; + break; +#endif /* not emacs */ + + default: + abort(); + } + continue; /* Successfully executed one pattern command; keep going. */ + + + /* We goto here if a matching operation fails. */ + fail: + if (!FAIL_STACK_EMPTY()) { /* A restart point is known. Restore to that state. 
*/ + DEBUG_PRINT1("\nFAIL:\n"); + POP_FAILURE_POINT(d, p, + lowest_active_reg, highest_active_reg, + regstart, regend, reg_info); + + /* If this failure point is a dummy, try the next one. */ + if (!p) + goto fail; + + /* If we failed to the end of the pattern, don't examine *p. */ + assert(p <= pend); + if (p < pend) { + boolean is_a_jump_n = false; + + /* If failed to a backwards jump that's part of a repetition + loop, need to pop this failure point and use the next one. */ + switch ((re_opcode_t) * p) { + case jump_n: + is_a_jump_n = true; /* Note fall through. */ + case maybe_pop_jump: + case pop_failure_jump: + case jump: + p1 = p + 1; + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + p1 += mcnt; + + if ((is_a_jump_n && (re_opcode_t) * p1 == succeed_n) + || (!is_a_jump_n + && (re_opcode_t) * p1 == on_failure_jump)) + goto fail; + break; + default: + /* do nothing */ ; + } + } + + if (d >= string1 && d <= end1) + dend = end_match_1; + } else + break; /* Matching at this starting point really fails. */ + } /* for (;;) */ + + if (best_regs_set) + goto restore_best_regs; + + FREE_VARIABLES(); + + return -1; /* Failure to match. */ +} /* re_match_2 */ + +/* Subroutine definitions for re_match_2. */ + + +/* We are passed P pointing to a register number after a start_memory. + + Return true if the pattern up to the corresponding stop_memory can + match the empty string, and false otherwise. + + If we find the matching stop_memory, sets P to point to one past its number. + Otherwise, sets P to an undefined byte less than or equal to END. + (NOTE(review): the code below stores to *P only on the stop_memory + path; on every false return *P is left as the caller passed it.) + + We don't handle duplicates properly (yet). */ + +static boolean group_match_null_string_p(p, end, reg_info) +unsigned char **p, *end; +register_info_type *reg_info; +{ + int mcnt; /* Operand most recently read by EXTRACT_NUMBER[_AND_INCR]. */ + + /* Point to after the args to the start_memory. */ + unsigned char *p1 = *p + 2; + + while (p1 < end) { + /* Skip over opcodes that can match nothing, and return true or + false, as appropriate, when we get to one that can't, or to the + matching stop_memory. 
*/ + + switch ((re_opcode_t) * p1) { + /* Could be either a loop or a series of alternatives. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + + /* If the next operation is not a jump backwards in the + pattern. */ + + if (mcnt >= 0) { + /* Go through the on_failure_jumps of the alternatives, + seeing if any of the alternatives cannot match nothing. + The last alternative starts with only a jump, + whereas the rest start with on_failure_jump and end + with a jump, e.g., here is the pattern for `a|b|c': + + /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 + /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 + /exactn/1/c + + So, we have to first go through the first (n-1) + alternatives and then deal with the last one separately. */ + + + /* Deal with the first (n-1) alternatives, which start + with an on_failure_jump (see above) that jumps to right + past a jump_past_alt. */ + + while ((re_opcode_t) p1[mcnt - 3] == jump_past_alt) { + /* `mcnt' holds how many bytes long the alternative + is, including the ending `jump_past_alt' and + its number. */ + + if (!alt_match_null_string_p(p1, p1 + mcnt - 3, + reg_info)) return false; + + /* Move to right after this alternative, including the + jump_past_alt. */ + p1 += mcnt; + + /* Break if it's the beginning of an n-th alternative + that doesn't begin with an on_failure_jump. */ + if ((re_opcode_t) * p1 != on_failure_jump) + break; + + /* Still have to check that it's not an n-th + alternative that starts with an on_failure_jump. */ + p1++; + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + if ((re_opcode_t) p1[mcnt - 3] != jump_past_alt) { + /* Get to the beginning of the n-th alternative. */ + p1 -= 3; + break; + } + } + + /* Deal with the last alternative: go back and get number + of the `jump_past_alt' just before it. `mcnt' contains + the length of the alternative. 
*/ + EXTRACT_NUMBER(mcnt, p1 - 2); + + if (!alt_match_null_string_p(p1, p1 + mcnt, reg_info)) + return false; + + p1 += mcnt; /* Get past the n-th alternative. */ + } /* if mcnt >= 0 */ + break; + + + /* Reached the close of this group without any op in between + reporting that it cannot match the empty string. */ + case stop_memory: + assert(p1[1] == **p); + *p = p1 + 2; + return true; + + + default: + if (!common_op_match_null_string_p(&p1, end, reg_info)) + return false; + } + } /* while p1 < end */ + + /* Ran up to END without meeting the group's stop_memory. */ + return false; +} /* group_match_null_string_p */ + + +/* Similar to group_match_null_string_p, but doesn't deal with alternatives: + It expects P to be the first byte of a single alternative and END one + byte past the last. The alternative can contain groups. + + Returns true when the scan reaches END without any opcode being + reported as unable to match the empty string, false otherwise. */ + +static boolean alt_match_null_string_p(p, end, reg_info) +unsigned char *p, *end; +register_info_type *reg_info; +{ + int mcnt; /* Operand of the on_failure_jump being skipped. */ + unsigned char *p1 = p; + + while (p1 < end) { + /* Skip over opcodes that can match nothing, and break when we get + to one that can't. */ + + switch ((re_opcode_t) * p1) { + /* It's a loop: step over the opcode and its operand, then over + the MCNT bytes the jump spans. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + p1 += mcnt; + break; + + default: + if (!common_op_match_null_string_p(&p1, end, reg_info)) + return false; + } + } /* while p1 < end */ + + return true; +} /* alt_match_null_string_p */ + + +/* Deals with the ops common to group_match_null_string_p and + alt_match_null_string_p. + + Sets P to one after the op and its arguments, if any. 
*/ + +static boolean common_op_match_null_string_p(p, end, reg_info) +unsigned char **p, *end; +register_info_type *reg_info; +{ + int mcnt; + boolean ret; + int reg_no; + unsigned char *p1 = *p; + + switch ((re_opcode_t) * p1++) { + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbeg: + case wordend: + case wordbound: + case notwordbound: +#ifdef emacs + case before_dot: + case at_dot: + case after_dot: +#endif + break; + + case start_memory: + reg_no = *p1; + assert(reg_no > 0 && reg_no <= MAX_REGNUM); + ret = group_match_null_string_p(&p1, end, reg_info); + + /* Have to set this here in case we're checking a group which + contains a group and a back reference to it. */ + + if (REG_MATCH_NULL_STRING_P(reg_info[reg_no]) == + MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P(reg_info[reg_no]) = ret; + + if (!ret) + return false; + break; + + /* If this is an optimized succeed_n for zero times, make the jump. */ + case jump: + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + if (mcnt >= 0) + p1 += mcnt; + else + return false; + break; + + case succeed_n: + /* Get to the number of times to succeed. */ + p1 += 2; + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + + if (mcnt == 0) { + p1 -= 4; + EXTRACT_NUMBER_AND_INCR(mcnt, p1); + p1 += mcnt; + } else + return false; + break; + + case duplicate: + if (!REG_MATCH_NULL_STRING_P(reg_info[*p1])) + return false; + break; + + case set_number_at: + p1 += 4; + + default: + /* All other opcodes mean we cannot match the empty string. */ + return false; + } + + *p = p1; + return true; +} /* common_op_match_null_string_p */ + + +/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN + bytes; nonzero otherwise. 
*/ + +static int bcmp_translate(s1, s2, len, translate) +const char *s1, *s2; +register int len; +RE_TRANSLATE_TYPE translate; +{ + register const unsigned char *p1 = (const unsigned char *) s1; + register const unsigned char *p2 = (const unsigned char *) s2; + + while (len) { + if (translate[*p1++] != translate[*p2++]) + return 1; + len--; + } + return 0; +} + +/* Entry points for GNU code. */ + +/* re_compile_pattern is the GNU regular expression compiler: it + compiles PATTERN (of length SIZE) and puts the result in BUFP. + Returns 0 if the pattern was valid, otherwise an error string. + + Assumes the `allocated' (and perhaps `buffer') and `translate' fields + are set in BUFP on entry. + + We call regex_compile to do the actual compilation. */ + +const char *re_compile_pattern(pattern, length, bufp) +const char *pattern; +size_t length; +struct re_pattern_buffer *bufp; +{ + reg_errcode_t ret; + + /* GNU code is written to assume at least RE_NREGS registers will be set + (and at least one extra will be -1). */ + bufp->regs_allocated = REGS_UNALLOCATED; + + /* And GNU code determines whether or not to get register information + by passing null for the REGS argument to re_match, etc., not by + setting no_sub. */ + bufp->no_sub = 0; + + /* Match anchors at newline. */ + bufp->newline_anchor = 1; + + ret = regex_compile(pattern, length, re_syntax_options, bufp); + + if (!ret) + return NULL; + return gettext(re_error_msgid + re_error_msgid_idx[(int) ret]); +} + +#ifdef _LIBC +weak_alias(__re_compile_pattern, re_compile_pattern) +#endif +/* Entry points compatible with 4.2 BSD regex library. We don't define + them unless specifically requested. */ +#if defined _REGEX_RE_COMP || defined _LIBC +/* BSD has one and only one pattern buffer. 
*/ +static struct re_pattern_buffer re_comp_buf; + +char * +#ifdef _LIBC +/* Make these definitions weak in libc, so POSIX programs can redefine + these names if they don't use our functions, and still use + regcomp/regexec below without link errors. */ weak_function +#endif +re_comp(s) +const char *s; +{ + reg_errcode_t ret; + + if (!s) { + if (!re_comp_buf.buffer) + return gettext("No previous regular expression"); + return 0; + } + + if (!re_comp_buf.buffer) { + re_comp_buf.buffer = (unsigned char *) malloc(200); + if (re_comp_buf.buffer == NULL) + return (char *) gettext(re_error_msgid + + + re_error_msgid_idx[(int) REG_ESPACE]); + re_comp_buf.allocated = 200; + + re_comp_buf.fastmap = (char *) malloc(1 << BYTEWIDTH); + if (re_comp_buf.fastmap == NULL) + return (char *) gettext(re_error_msgid + + + re_error_msgid_idx[(int) REG_ESPACE]); + } + + /* Since `re_exec' always passes NULL for the `regs' argument, we + don't need to initialize the pattern buffer fields which affect it. */ + + /* Match anchors at newlines. */ + re_comp_buf.newline_anchor = 1; + + ret = regex_compile(s, strlen(s), re_syntax_options, &re_comp_buf); + + if (!ret) + return NULL; + + /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ + return (char *) gettext(re_error_msgid + + re_error_msgid_idx[(int) ret]); +} + + +int +#ifdef _LIBC + weak_function +#endif +re_exec(s) +const char *s; +{ + const int len = strlen(s); + + return + 0 <= re_search(&re_comp_buf, s, len, 0, len, + (struct re_registers *) 0); +} + +#endif /* _REGEX_RE_COMP */ + +/* POSIX.2 functions. Don't define these for Emacs. */ + +#ifndef emacs + +/* regcomp takes a regular expression as a string and compiles it. + + PREG is a regex_t *. We do not expect any fields to be initialized, + since POSIX says we shouldn't. 
Thus, we set + + `buffer' to the compiled pattern; + `used' to the length of the compiled pattern; + `syntax' to RE_SYNTAX_POSIX_EXTENDED if the + REG_EXTENDED bit in CFLAGS is set; otherwise, to + RE_SYNTAX_POSIX_BASIC; + `newline_anchor' to REG_NEWLINE being set in CFLAGS; + `fastmap' to an allocated space for the fastmap; + `fastmap_accurate' to zero; + `re_nsub' to the number of subexpressions in PATTERN. + + PATTERN is the address of the pattern string. + + CFLAGS is a series of bits which affect compilation. + + If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we + use POSIX basic syntax. + + If REG_NEWLINE is set, then . and [^...] don't match newline. + Also, regexec will try a match beginning after every newline. + + If REG_ICASE is set, then we considers upper- and lowercase + versions of letters to be equivalent when matching. + + If REG_NOSUB is set, then when PREG is passed to regexec, that + routine will report only success or failure, and nothing about the + registers. + + It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for + the return codes and their meanings.) */ + +int regcomp(preg, pattern, cflags) +regex_t *preg; +const char *pattern; +int cflags; +{ + reg_errcode_t ret; + reg_syntax_t syntax + = (cflags & REG_EXTENDED) ? + + RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; + + /* regex_compile will allocate the space for the compiled pattern. */ + preg->buffer = 0; + preg->allocated = 0; + preg->used = 0; + + /* Try to allocate space for the fastmap. */ + preg->fastmap = (char *) malloc(1 << BYTEWIDTH); + + if (cflags & REG_ICASE) { + unsigned i; + + preg->translate + = (RE_TRANSLATE_TYPE) malloc(CHAR_SET_SIZE + * sizeof(*(RE_TRANSLATE_TYPE) 0)); + if (preg->translate == NULL) + return (int) REG_ESPACE; + + /* Map uppercase characters to corresponding lowercase ones. */ + for (i = 0; i < CHAR_SET_SIZE; i++) + preg->translate[i] = ISUPPER(i) ? 
TOLOWER(i) : i; + } else + preg->translate = NULL; + + /* If REG_NEWLINE is set, newlines are treated differently. */ + if (cflags & REG_NEWLINE) { /* REG_NEWLINE implies neither . nor [^...] match newline. */ + syntax &= ~RE_DOT_NEWLINE; + syntax |= RE_HAT_LISTS_NOT_NEWLINE; + /* It also changes the matching behavior. */ + preg->newline_anchor = 1; + } else + preg->newline_anchor = 0; + + preg->no_sub = !!(cflags & REG_NOSUB); + + /* POSIX says a null character in the pattern terminates it, so we + can use strlen here in compiling the pattern. */ + ret = regex_compile(pattern, strlen(pattern), syntax, preg); + + /* POSIX doesn't distinguish between an unmatched open-group and an + unmatched close-group: both are REG_EPAREN. */ + if (ret == REG_ERPAREN) + ret = REG_EPAREN; + + if (ret == REG_NOERROR && preg->fastmap) { + /* Compute the fastmap now, since regexec cannot modify the pattern + buffer. */ + if (re_compile_fastmap(preg) == -2) { + /* Some error occurred while computing the fastmap, just forget + about it. */ + free(preg->fastmap); + preg->fastmap = NULL; + } + } + + return (int) ret; +} + +#ifdef _LIBC +weak_alias(__regcomp, regcomp) +#endif +/* regexec searches for a given pattern, specified by PREG, in the + string STRING. + + If NMATCH is zero or REG_NOSUB was set in the cflags argument to + `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at + least NMATCH elements, and we set them to the offsets of the + corresponding matched substrings. + + EFLAGS specifies `execution flags' which affect matching: if + REG_NOTBOL is set, then ^ does not match at the beginning of the + string; if REG_NOTEOL is set, then $ does not match at the end. + + We return 0 if we find a match and REG_NOMATCH if not. 
*/ +int regexec(preg, string, nmatch, pmatch, eflags) +const regex_t *preg; +const char *string; +size_t nmatch; +regmatch_t pmatch[]; +int eflags; +{ + int ret; + struct re_registers regs; + regex_t private_preg; + int len = strlen(string); + boolean want_reg_info = !preg->no_sub && nmatch > 0; + + private_preg = *preg; + + private_preg.not_bol = !!(eflags & REG_NOTBOL); + private_preg.not_eol = !!(eflags & REG_NOTEOL); + + /* The user has told us exactly how many registers to return + information about, via `nmatch'. We have to pass that on to the + matching routines. */ + private_preg.regs_allocated = REGS_FIXED; + + if (want_reg_info) { + regs.num_regs = nmatch; + regs.start = TALLOC(nmatch * 2, regoff_t); + if (regs.start == NULL) + return (int) REG_NOMATCH; + regs.end = regs.start + nmatch; + } + + /* Perform the searching operation. */ + ret = re_search(&private_preg, string, len, + /* start: */ 0, /* range: */ len, + want_reg_info ? ®s : (struct re_registers *) 0); + + /* Copy the register information to the POSIX structure. */ + if (want_reg_info) { + if (ret >= 0) { + unsigned r; + + for (r = 0; r < nmatch; r++) { + pmatch[r].rm_so = regs.start[r]; + pmatch[r].rm_eo = regs.end[r]; + } + } + + /* If we needed the temporary register info, free the space now. */ + free(regs.start); + } + + /* We want zero return to mean success, unlike `re_search'. */ + return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; +} + +#ifdef _LIBC +weak_alias(__regexec, regexec) +#endif +/* Returns a message corresponding to an error code, ERRCODE, returned + from either regcomp or regexec. We don't use PREG here. */ + size_t regerror(errcode, preg, errbuf, errbuf_size) +int errcode; +const regex_t *preg; +char *errbuf; +size_t errbuf_size; +{ + const char *msg; + size_t msg_size; + + if (errcode < 0 || errcode >= (int) (sizeof(re_error_msgid_idx) + / sizeof(re_error_msgid_idx[0]))) + /* Only error codes returned by the rest of the code should be passed + to this routine. 
If we are given anything else, or if other regex + code generates an invalid error code, then the program has a bug. + Dump core so we can fix it. */ + abort(); + + msg = gettext(re_error_msgid + re_error_msgid_idx[errcode]); + + msg_size = strlen(msg) + 1; /* Includes the null. */ + + if (errbuf_size != 0) { + if (msg_size > errbuf_size) { +#if defined HAVE_MEMPCPY || defined _LIBC + *((char *) __mempcpy(errbuf, msg, errbuf_size - 1)) = '\0'; +#else + memcpy(errbuf, msg, errbuf_size - 1); + errbuf[errbuf_size - 1] = 0; +#endif + } else + memcpy(errbuf, msg, msg_size); + } + + return msg_size; +} + +#ifdef _LIBC +weak_alias(__regerror, regerror) +#endif +/* Free dynamically allocated space used by PREG. */ +void regfree(preg) +regex_t *preg; +{ + if (preg->buffer != NULL) + free(preg->buffer); + preg->buffer = NULL; + + preg->allocated = 0; + preg->used = 0; + + if (preg->fastmap != NULL) + free(preg->fastmap); + preg->fastmap = NULL; + preg->fastmap_accurate = 0; + + if (preg->translate != NULL) + free(preg->translate); + preg->translate = NULL; +} + +#ifdef _LIBC +weak_alias(__regfree, regfree) +#endif +#endif /* not emacs */ diff --git a/libc/misc/regex/rx.c b/libc/misc/regex/rx.c deleted file mode 100644 index 39f77adb6..000000000 --- a/libc/misc/regex/rx.c +++ /dev/null @@ -1,7273 +0,0 @@ -/* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc. - -This file is part of the librx library. - -Librx is free software; you can redistribute it and/or modify it under -the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. - -Librx is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. 
- -You should have received a copy of the GNU Library General Public -License along with this software; see the file COPYING.LIB. If not, -write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA -02139, USA. */ - -/* NOTE!!! AIX is so losing it requires this to be the first thing in the - * file. - * Do not put ANYTHING before it! - */ -#if !defined (__GNUC__) && defined (_AIX) -#pragma alloca -#endif - -/* To make linux happy? */ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <ctype.h> -#ifndef isgraph -#define isgraph(c) (isprint (c) && !isspace (c)) -#endif -#ifndef isblank -#define isblank(c) ((c) == ' ' || (c) == '\t') -#endif - -#include <sys/types.h> - -#undef MAX -#undef MIN -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -typedef char boolean; - -#define false 0 -#define true 1 - -#ifndef __GCC__ -#undef __inline__ -#define __inline__ -#endif - -/* Emacs already defines alloca, sometimes. */ -#ifndef alloca - -/* Make alloca work the best possible way. */ -#ifdef __GNUC__ -#define alloca __builtin_alloca -#else /* not __GNUC__ */ -#if HAVE_ALLOCA_H -#include <alloca.h> -#else /* not __GNUC__ or HAVE_ALLOCA_H */ -#ifndef _AIX /* Already did AIX, up at the top. */ -char *alloca(); -#endif /* not _AIX */ -#endif /* not HAVE_ALLOCA_H */ -#endif /* not __GNUC__ */ - -#endif /* not alloca */ - -/* Memory management and stuff for emacs. */ - -#define CHARBITS 8 -#define remalloc(M, S) (M ? realloc (M, S) : malloc (S)) - - -/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we - * use `alloca' instead of `malloc' for the backtracking stack. - * - * Emacs will die miserably if we don't do this. 
- */ - -#ifdef REGEX_MALLOC -#define REGEX_ALLOCATE malloc -#else /* not REGEX_MALLOC */ -#define REGEX_ALLOCATE alloca -#endif /* not REGEX_MALLOC */ - - -#ifdef RX_WANT_RX_DEFS -#define RX_DECL extern -#define RX_DEF_QUAL -#else -#define RX_WANT_RX_DEFS -#define RX_DECL static -#define RX_DEF_QUAL static -#endif - -#include <regex.h> -#undef RX_DECL -#define RX_DECL RX_DEF_QUAL - - -/* - * Prototypes. - */ -#ifdef __STDC__ -RX_DECL struct rx_hash_item -*rx_hash_find(struct rx_hash *, unsigned long, - - void *, struct rx_hash_rules *); -RX_DECL struct rx_hash_item -*rx_hash_find(struct rx_hash *, unsigned long, - - void *, struct rx_hash_rules *); -RX_DECL struct rx_hash_item -*rx_hash_store(struct rx_hash *, unsigned long, - - void *, struct rx_hash_rules *); -RX_DECL void rx_hash_free(struct rx_hash_item *, struct rx_hash_rules *); -RX_DECL void rx_free_hash_table(struct rx_hash *, rx_hash_freefn, - - struct rx_hash_rules *); -RX_DECL rx_Bitset rx_cset(struct rx *); -RX_DECL rx_Bitset rx_copy_cset(struct rx *, rx_Bitset); -RX_DECL void rx_free_cset(struct rx *, rx_Bitset); -static struct rx_hash_item -*compiler_hash_item_alloc(struct rx_hash_rules *, void *); -static struct rx_hash -*compiler_hash_alloc(struct rx_hash_rules *); -static void compiler_free_hash(struct rx_hash *, struct rx_hash_rules *); -static void compiler_free_hash_item(struct rx_hash_item *, - - struct rx_hash_rules *); -RX_DECL struct rexp_node -*rexp_node(struct rx *, enum rexp_node_type); -RX_DECL struct rexp_node -*rx_mk_r_cset(struct rx *, rx_Bitset); -RX_DECL struct rexp_node -*rx_mk_r_concat(struct rx *, struct rexp_node *, struct rexp_node *); -RX_DECL struct rexp_node -*rx_mk_r_alternate(struct rx *, struct rexp_node *, struct rexp_node *); -RX_DECL struct rexp_node -*rx_mk_r_alternate(struct rx *, struct rexp_node *, struct rexp_node *); -RX_DECL struct rexp_node -*rx_mk_r_opt(struct rx *, struct rexp_node *); -RX_DECL struct rexp_node -*rx_mk_r_star(struct rx *, struct rexp_node *); 
-RX_DECL struct rexp_node -*rx_mk_r_2phase_star(struct rx *, struct rexp_node *, struct rexp_node *); -RX_DECL struct rexp_node -*rx_mk_r_side_effect(struct rx *, rx_side_effect); - -//RX_DECL struct rexp_node -// *rx_mk_r_data (struct rx *, void *); -RX_DECL void rx_free_rexp(struct rx *, struct rexp_node *); -RX_DECL struct rexp_node -*rx_copy_rexp(struct rx *, struct rexp_node *); -RX_DECL struct rx_nfa_state -*rx_nfa_state(struct rx *); -RX_DECL void rx_free_nfa_state(struct rx_nfa_state *); -RX_DECL struct rx_nfa_state -*rx_id_to_nfa_state(struct rx *, int); -RX_DECL struct rx_nfa_edge -*rx_nfa_edge(struct rx *, enum rx_nfa_etype, - - struct rx_nfa_state *, struct rx_nfa_state *); -RX_DECL void rx_free_nfa_edge(struct rx_nfa_edge *); -static struct rx_possible_future -*rx_possible_future(struct rx *, struct rx_se_list *); -static void rx_free_possible_future(struct rx_possible_future *); -RX_DECL void rx_free_nfa(struct rx *); -RX_DECL int rx_build_nfa(struct rx *, struct rexp_node *, - struct rx_nfa_state **, struct rx_nfa_state **); -RX_DECL void rx_name_nfa_states(struct rx *); -static int se_list_cmp(void *, void *); -static int se_list_equal(void *, void *); -static struct rx_se_list -*hash_cons_se_prog(struct rx *, struct rx_hash *, - - void *, struct rx_se_list *); -static struct rx_se_list -*hash_se_prog(struct rx *, struct rx_hash *, struct rx_se_list *); -static int nfa_set_cmp(void *, void *); -static int nfa_set_equal(void *, void *); -static struct rx_nfa_state_set -*nfa_set_cons(struct rx *, struct rx_hash *, - - struct rx_nfa_state *, struct rx_nfa_state_set *); -static struct rx_nfa_state_set -*nfa_set_enjoin(struct rx *, struct rx_hash *, - - struct rx_nfa_state *, struct rx_nfa_state_set *); -#endif - -#ifndef emacs - -#ifdef SYNTAX_TABLE -extern char *re_syntax_table; -#else /* not SYNTAX_TABLE */ - -#ifndef RX_WANT_RX_DEFS -RX_DECL char re_syntax_table[CHAR_SET_SIZE]; -#endif - -#ifdef __STDC__ -static void init_syntax_once(void) -#else 
-static void init_syntax_once() -#endif -{ - register int c; - static int done = 0; - - if (done) - return; - - bzero(re_syntax_table, sizeof re_syntax_table); - - for (c = 'a'; c <= 'z'; c++) - re_syntax_table[c] = Sword; - - for (c = 'A'; c <= 'Z'; c++) - re_syntax_table[c] = Sword; - - for (c = '0'; c <= '9'; c++) - re_syntax_table[c] = Sword; - - re_syntax_table['_'] = Sword; - - done = 1; -} -#endif /* not SYNTAX_TABLE */ -#endif /* not emacs */ - -/* Compile with `-DRX_DEBUG' and use the following flags. - * - * Debugging flags: - * rx_debug - print information as a regexp is compiled - * rx_debug_trace - print information as a regexp is executed - */ - -#ifdef RX_DEBUG - -int rx_debug_compile = 0; -int rx_debug_trace = 0; -static struct re_pattern_buffer *dbug_rxb = 0; - - -/* - * More Prototypes - */ -#ifdef __STDC__ -typedef void (*side_effect_printer) (struct rx *, void *, FILE *); -static void print_cset(struct rx *, rx_Bitset, FILE *); -static void print_rexp(struct rx *, struct rexp_node *, int, - side_effect_printer, FILE *); -static void print_nfa(struct rx *, struct rx_nfa_state *, - side_effect_printer, FILE *); -static void re_seprint(struct rx *, void *, FILE *); -void print_compiled_pattern(struct re_pattern_buffer *); -void print_fastmap(char *); -#else -typedef void (*side_effect_printer) (); -static void print_cset(); -#endif - -#ifdef __STDC__ -static void -print_rexp(struct rx *rx, - struct rexp_node *node, int depth, - side_effect_printer seprint, FILE * fp) -#else -static void print_rexp(rx, node, depth, seprint, fp) -struct rx *rx; -struct rexp_node *node; -int depth; -side_effect_printer seprint; -FILE *fp; -#endif -{ - if (!node) - return; - else { - switch (node->type) { - case r_cset: - { - fprintf(fp, "%*s", depth, ""); - print_cset(rx, node->params.cset, fp); - fputc('\n', fp); - break; - } - - case r_opt: - case r_star: - fprintf(fp, "%*s%s\n", depth, "", - node->type == r_opt ? 
"opt" : "star"); - print_rexp(rx, node->params.pair.left, depth + 3, seprint, fp); - break; - - case r_2phase_star: - fprintf(fp, "%*s2phase star\n", depth, ""); - print_rexp(rx, node->params.pair.right, depth + 3, seprint, - fp); - print_rexp(rx, node->params.pair.left, depth + 3, seprint, fp); - break; - - - case r_alternate: - case r_concat: - fprintf(fp, "%*s%s\n", depth, "", - node->type == r_alternate ? "alt" : "concat"); - print_rexp(rx, node->params.pair.left, depth + 3, seprint, fp); - print_rexp(rx, node->params.pair.right, depth + 3, seprint, - fp); - break; - case r_side_effect: - fprintf(fp, "%*sSide effect: ", depth, ""); - seprint(rx, node->params.side_effect, fp); - fputc('\n', fp); - } - } -} - -#ifdef __STDC__ -static void -print_nfa(struct rx *rx, - struct rx_nfa_state *n, side_effect_printer seprint, FILE * fp) -#else -static void print_nfa(rx, n, seprint, fp) -struct rx *rx; -struct rx_nfa_state *n; -side_effect_printer seprint; -FILE *fp; -#endif -{ - while (n) { - struct rx_nfa_edge *e = n->edges; - struct rx_possible_future *ec = n->futures; - - fprintf(fp, "node %d %s\n", n->id, - n->is_final ? "final" : (n->is_start ? 
"start" : "")); - while (e) { - fprintf(fp, " edge to %d, ", e->dest->id); - switch (e->type) { - case ne_epsilon: - fprintf(fp, "epsilon\n"); - break; - case ne_side_effect: - fprintf(fp, "side effect "); - seprint(rx, e->params.side_effect, fp); - fputc('\n', fp); - break; - case ne_cset: - fprintf(fp, "cset "); - print_cset(rx, e->params.cset, fp); - fputc('\n', fp); - break; - } - e = e->next; - } - - while (ec) { - int x; - struct rx_nfa_state_set *s; - struct rx_se_list *l; - - fprintf(fp, " eclosure to {"); - for (s = ec->destset; s; s = s->cdr) - fprintf(fp, "%d ", s->car->id); - fprintf(fp, "} ("); - for (l = ec->effects; l; l = l->cdr) { - seprint(rx, l->car, fp); - fputc(' ', fp); - } - fprintf(fp, ")\n"); - ec = ec->next; - } - n = n->next; - } -} - -static char *efnames[] = { - "bogon", - "re_se_try", - "re_se_pushback", - "re_se_push0", - "re_se_pushpos", - "re_se_chkpos", - "re_se_poppos", - "re_se_at_dot", - "re_se_syntax", - "re_se_not_syntax", - "re_se_begbuf", - "re_se_hat", - "re_se_wordbeg", - "re_se_wordbound", - "re_se_notwordbound", - "re_se_wordend", - "re_se_endbuf", - "re_se_dollar", - "re_se_fail", -}; - -static char *efnames2[] = { - "re_se_win", - "re_se_lparen", - "re_se_rparen", - "re_se_backref", - "re_se_iter", - "re_se_end_iter", - "re_se_tv" -}; - -static char *inx_names[] = { - "rx_backtrack_point", - "rx_do_side_effects", - "rx_cache_miss", - "rx_next_char", - "rx_backtrack", - "rx_error_inx", - "rx_num_instructions" -}; - - -#ifdef __STDC__ -static void re_seprint(struct rx *rx, void *effect, FILE * fp) -#else -static void re_seprint(rx, effect, fp) -struct rx *rx; -void *effect; -FILE *fp; -#endif -{ - if ((int) effect < 0) - fputs(efnames[-(int) effect], fp); - else if (dbug_rxb) { - struct re_se_params *p = &dbug_rxb->se_params[(int) effect]; - - fprintf(fp, "%s(%d,%d)", efnames2[p->se], p->op1, p->op2); - } else - fprintf(fp, "[complex op # %d]", (int) effect); -} - -/* These are so the regex.c regression tests will 
compile. */ -void print_compiled_pattern(rxb) -struct re_pattern_buffer *rxb; -{ -} - -void print_fastmap(fm) -char *fm; -{ -} - -#endif /* RX_DEBUG */ - - - -/* This page: Bitsets. Completely unintersting. */ - -//RX_DECL int rx_bitset_is_equal (int, rx_Bitset, rx_Bitset); -RX_DECL int rx_bitset_is_subset(int, rx_Bitset, rx_Bitset); - -//RX_DECL int rx_bitset_empty (int, rx_Bitset); -RX_DECL void rx_bitset_null(int, rx_Bitset); -RX_DECL void rx_bitset_complement(int, rx_Bitset); -RX_DECL void rx_bitset_complement(int, rx_Bitset); -RX_DECL void rx_bitset_assign(int, rx_Bitset, rx_Bitset); -RX_DECL void rx_bitset_union(int, rx_Bitset, rx_Bitset); -RX_DECL void rx_bitset_intersection(int, rx_Bitset, rx_Bitset); -RX_DECL void rx_bitset_difference(int, rx_Bitset, rx_Bitset); - -//RX_DECL void rx_bitset_revdifference (int, rx_Bitset, rx_Bitset); -#ifdef emacs -RX_DECL void rx_bitset_xor(int, rx_Bitset, rx_Bitset); -#endif -RX_DECL unsigned long rx_bitset_hash(int, rx_Bitset); - -#if 0 -#ifdef __STDC__ -RX_DECL int rx_bitset_is_equal(int size, rx_Bitset a, rx_Bitset b) -#else -RX_DECL int rx_bitset_is_equal(size, a, b) -int size; -rx_Bitset a; -rx_Bitset b; -#endif -{ - int x; - RX_subset s = b[0]; - - b[0] = ~a[0]; - - for (x = rx_bitset_numb_subsets(size) - 1; a[x] == b[x]; --x); - - b[0] = s; - return !x && s == a[0]; -} -#endif - -#ifdef __STDC__ -RX_DECL int rx_bitset_is_subset(int size, rx_Bitset a, rx_Bitset b) -#else -RX_DECL int rx_bitset_is_subset(size, a, b) -int size; -rx_Bitset a; -rx_Bitset b; -#endif -{ - int x = rx_bitset_numb_subsets(size) - 1; - - while (x-- && (a[x] & b[x]) == a[x]); - return x == -1; -} - -#if 0 -#ifdef __STDC__ -RX_DECL int rx_bitset_empty(int size, rx_Bitset set) -#else -RX_DECL int rx_bitset_empty(size, set) -int size; -rx_Bitset set; -#endif -{ - int x; - RX_subset s = set[0]; - - set[0] = 1; - for (x = rx_bitset_numb_subsets(size) - 1; !set[x]; --x); - set[0] = s; - return !s; -} -#endif - -#ifdef __STDC__ -RX_DECL void 
rx_bitset_null(int size, rx_Bitset b) -#else -RX_DECL void rx_bitset_null(size, b) -int size; -rx_Bitset b; -#endif -{ - bzero(b, rx_sizeof_bitset(size)); -} - - -#ifdef __STDC__ -RX_DECL void rx_bitset_universe(int size, rx_Bitset b) -#else -RX_DECL void rx_bitset_universe(size, b) -int size; -rx_Bitset b; -#endif -{ - int x = rx_bitset_numb_subsets(size); - - while (x--) - *b++ = ~(RX_subset) 0; -} - - -#ifdef __STDC__ -RX_DECL void rx_bitset_complement(int size, rx_Bitset b) -#else -RX_DECL void rx_bitset_complement(size, b) -int size; -rx_Bitset b; -#endif -{ - int x = rx_bitset_numb_subsets(size); - - while (x--) { - *b = ~*b; - ++b; - } -} - - -#ifdef __STDC__ -RX_DECL void rx_bitset_assign(int size, rx_Bitset a, rx_Bitset b) -#else -RX_DECL void rx_bitset_assign(size, a, b) -int size; -rx_Bitset a; -rx_Bitset b; -#endif -{ - int x; - - for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x) - a[x] = b[x]; -} - -#ifdef __STDC__ -RX_DECL void rx_bitset_union(int size, rx_Bitset a, rx_Bitset b) -#else -RX_DECL void rx_bitset_union(size, a, b) -int size; -rx_Bitset a; -rx_Bitset b; -#endif -{ - int x; - - for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x) - a[x] |= b[x]; -} - - -#ifdef __STDC__ -RX_DECL void rx_bitset_intersection(int size, rx_Bitset a, rx_Bitset b) -#else -RX_DECL void rx_bitset_intersection(size, a, b) -int size; -rx_Bitset a; -rx_Bitset b; -#endif -{ - int x; - - for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x) - a[x] &= b[x]; -} - - -#ifdef __STDC__ -RX_DECL void rx_bitset_difference(int size, rx_Bitset a, rx_Bitset b) -#else -RX_DECL void rx_bitset_difference(size, a, b) -int size; -rx_Bitset a; -rx_Bitset b; -#endif -{ - int x; - - for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x) - a[x] &= ~b[x]; -} - - -#if 0 -#ifdef __STDC__ -RX_DECL void rx_bitset_revdifference(int size, rx_Bitset a, rx_Bitset b) -#else -RX_DECL void rx_bitset_revdifference(size, a, b) -int size; -rx_Bitset a; -rx_Bitset b; -#endif -{ - int x; - - for 
(x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x) - a[x] = ~a[x] & b[x]; -} -#endif - - -#ifdef emacs -#ifdef __STDC__ -RX_DECL void rx_bitset_xor(int size, rx_Bitset a, rx_Bitset b) -#else -RX_DECL void rx_bitset_xor(size, a, b) -int size; -rx_Bitset a; -rx_Bitset b; -#endif -{ - int x; - - for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x) - a[x] ^= b[x]; -} -#endif - - -#ifdef __STDC__ -RX_DECL unsigned long rx_bitset_hash(int size, rx_Bitset b) -#else -RX_DECL unsigned long rx_bitset_hash(size, b) -int size; -rx_Bitset b; -#endif -{ - int x; - unsigned long hash = (unsigned long) rx_bitset_hash; - - for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x) - hash ^= rx_bitset_subset_val(b, x); - - return hash; -} - -RX_DECL RX_subset rx_subset_singletons[RX_subset_bits] = { - 0x1, - 0x2, - 0x4, - 0x8, - 0x10, - 0x20, - 0x40, - 0x80, - 0x100, - 0x200, - 0x400, - 0x800, - 0x1000, - 0x2000, - 0x4000, - 0x8000, - 0x10000, - 0x20000, - 0x40000, - 0x80000, - 0x100000, - 0x200000, - 0x400000, - 0x800000, - 0x1000000, - 0x2000000, - 0x4000000, - 0x8000000, - 0x10000000, - 0x20000000, - 0x40000000, - 0x80000000 -}; - -#ifdef RX_DEBUG - -#ifdef __STDC__ -static void print_cset(struct rx *rx, rx_Bitset cset, FILE * fp) -#else -static void print_cset(rx, cset, fp) -struct rx *rx; -rx_Bitset cset; -FILE *fp; -#endif -{ - int x; - - fputc('[', fp); - for (x = 0; x < rx->local_cset_size; ++x) - if (RX_bitset_member(cset, x)) { - if (isprint(x)) - fputc(x, fp); - else - fprintf(fp, "\\0%o ", x); - } - fputc(']', fp); -} - -#endif /* RX_DEBUG */ - - - -static unsigned long rx_hash_masks[4] = { - 0x12488421, - 0x96699669, - 0xbe7dd7eb, - 0xffffffff -}; - - -/* Hash tables */ -#ifdef __STDC__ -RX_DECL struct rx_hash_item *rx_hash_find(struct rx_hash *table, - unsigned long hash, - void *value, - struct rx_hash_rules *rules) -#else -RX_DECL struct rx_hash_item *rx_hash_find(table, hash, value, rules) -struct rx_hash *table; -unsigned long hash; -void *value; -struct 
rx_hash_rules *rules; -#endif -{ - rx_hash_eq eq = rules->eq; - int maskc = 0; - long mask = rx_hash_masks[0]; - int bucket = (hash & mask) % 13; - - while (table->children[bucket]) { - table = table->children[bucket]; - ++maskc; - mask = rx_hash_masks[maskc]; - bucket = (hash & mask) % 13; - } - - { - struct rx_hash_item *it = table->buckets[bucket]; - - while (it) - if (eq(it->data, value)) - return it; - else - it = it->next_same_hash; - } - - return 0; -} - -#ifdef __STDC__ -RX_DECL struct rx_hash_item *rx_hash_store(struct rx_hash *table, - unsigned long hash, - void *value, - struct rx_hash_rules *rules) -#else -RX_DECL struct rx_hash_item *rx_hash_store(table, hash, value, rules) -struct rx_hash *table; -unsigned long hash; -void *value; -struct rx_hash_rules *rules; -#endif -{ - rx_hash_eq eq = rules->eq; - int maskc = 0; - long mask = rx_hash_masks[0]; - int bucket = (hash & mask) % 13; - int depth = 0; - - while (table->children[bucket]) { - table = table->children[bucket]; - ++maskc; - mask = rx_hash_masks[maskc]; - bucket = (hash & mask) % 13; - ++depth; - } - - { - struct rx_hash_item *it = table->buckets[bucket]; - - while (it) - if (eq(it->data, value)) - return it; - else - it = it->next_same_hash; - } - - { - if ((depth < 3) - && (table->bucket_size[bucket] >= 4)) { - struct rx_hash *newtab = ((struct rx_hash *) - rules->hash_alloc(rules)); - - if (!newtab) - goto add_to_bucket; - bzero(newtab, sizeof(*newtab)); - newtab->parent = table; - { - struct rx_hash_item *them = table->buckets[bucket]; - unsigned long newmask = rx_hash_masks[maskc + 1]; - - while (them) { - struct rx_hash_item *save = them->next_same_hash; - int new_buck = (them->hash & newmask) % 13; - - them->next_same_hash = newtab->buckets[new_buck]; - newtab->buckets[new_buck] = them; - them->table = newtab; - them = save; - ++newtab->bucket_size[new_buck]; - ++newtab->refs; - } - table->refs = - (table->refs - table->bucket_size[bucket] + 1); - table->bucket_size[bucket] = 0; - 
table->buckets[bucket] = 0; - table->children[bucket] = newtab; - table = newtab; - bucket = (hash & newmask) % 13; - } - } - } - add_to_bucket: - { - struct rx_hash_item *it = ((struct rx_hash_item *) - rules->hash_item_alloc(rules, value)); - - if (!it) - return 0; - it->hash = hash; - it->table = table; - /* DATA and BINDING are to be set in hash_item_alloc */ - it->next_same_hash = table->buckets[bucket]; - table->buckets[bucket] = it; - ++table->bucket_size[bucket]; - ++table->refs; - return it; - } -} - - -#ifdef __STDC__ -RX_DECL void -rx_hash_free(struct rx_hash_item *it, struct rx_hash_rules *rules) -#else -RX_DECL void rx_hash_free(it, rules) -struct rx_hash_item *it; -struct rx_hash_rules *rules; -#endif -{ - if (it) { - struct rx_hash *table = it->table; - unsigned long hash = it->hash; - int depth = (table->parent - ? (table->parent->parent - ? (table->parent->parent->parent ? 3 : 2) - : 1) - : 0); - int bucket = (hash & rx_hash_masks[depth]) % 13; - struct rx_hash_item **pos = &table->buckets[bucket]; - - while (*pos != it) - pos = &(*pos)->next_same_hash; - *pos = it->next_same_hash; - rules->free_hash_item(it, rules); - --table->bucket_size[bucket]; - --table->refs; - while (!table->refs && depth) { - struct rx_hash *save = table; - - table = table->parent; - --depth; - bucket = (hash & rx_hash_masks[depth]) % 13; - --table->refs; - table->children[bucket] = 0; - rules->free_hash(save, rules); - } - } -} - -#ifdef __STDC__ -RX_DECL void -rx_free_hash_table(struct rx_hash *tab, rx_hash_freefn freefn, - struct rx_hash_rules *rules) -#else -RX_DECL void rx_free_hash_table(tab, freefn, rules) -struct rx_hash *tab; -rx_hash_freefn freefn; -struct rx_hash_rules *rules; -#endif -{ - int x; - - for (x = 0; x < 13; ++x) - if (tab->children[x]) { - rx_free_hash_table(tab->children[x], freefn, rules); - rules->free_hash(tab->children[x], rules); - } else { - struct rx_hash_item *them = tab->buckets[x]; - - while (them) { - struct rx_hash_item *that = them; - - 
them = that->next_same_hash; - freefn(that); - rules->free_hash_item(that, rules); - } - } -} - - - -/* Utilities for manipulating bitset represntations of characters sets. */ - -#ifdef __STDC__ -RX_DECL rx_Bitset rx_cset(struct rx *rx) -#else -RX_DECL rx_Bitset rx_cset(rx) -struct rx *rx; -#endif -{ - rx_Bitset b = - - (rx_Bitset) malloc(rx_sizeof_bitset(rx->local_cset_size)); - if (b) - rx_bitset_null(rx->local_cset_size, b); - return b; -} - - -#ifdef __STDC__ -RX_DECL rx_Bitset rx_copy_cset(struct rx * rx, rx_Bitset a) -#else -RX_DECL rx_Bitset rx_copy_cset(rx, a) -struct rx *rx; -rx_Bitset a; -#endif -{ - rx_Bitset cs = rx_cset(rx); - - if (cs) - rx_bitset_union(rx->local_cset_size, cs, a); - - return cs; -} - - -#ifdef __STDC__ -RX_DECL void rx_free_cset(struct rx *rx, rx_Bitset c) -#else -RX_DECL void rx_free_cset(rx, c) -struct rx *rx; -rx_Bitset c; -#endif -{ - if (c) - free((char *) c); -} - - -/* Hash table memory allocation policy for the regexp compiler */ - -#ifdef __STDC__ -static struct rx_hash *compiler_hash_alloc(struct rx_hash_rules *rules) -#else -static struct rx_hash *compiler_hash_alloc(rules) -struct rx_hash_rules *rules; -#endif -{ - return (struct rx_hash *) malloc(sizeof(struct rx_hash)); -} - - -#ifdef __STDC__ -static struct rx_hash_item *compiler_hash_item_alloc(struct rx_hash_rules - *rules, void *value) -#else -static struct rx_hash_item *compiler_hash_item_alloc(rules, value) -struct rx_hash_rules *rules; -void *value; -#endif -{ - struct rx_hash_item *it; - - it = (struct rx_hash_item *) malloc(sizeof(*it)); - if (it) { - it->data = value; - it->binding = 0; - } - return it; -} - -#ifdef __STDC__ -static void -compiler_free_hash(struct rx_hash *tab, struct rx_hash_rules *rules) -#else -static void compiler_free_hash(tab, rules) -struct rx_hash *tab; -struct rx_hash_rules *rules; -#endif -{ - free((char *) tab); -} - - -#ifdef __STDC__ -static void -compiler_free_hash_item(struct rx_hash_item *item, - struct rx_hash_rules *rules) 
-#else -static void compiler_free_hash_item(item, rules) -struct rx_hash_item *item; -struct rx_hash_rules *rules; -#endif -{ - free((char *) item); -} - - -/* This page: REXP_NODE (expression tree) structures. */ - -#ifdef __STDC__ -RX_DECL struct rexp_node *rexp_node(struct rx *rx, - enum rexp_node_type type) -#else -RX_DECL struct rexp_node *rexp_node(rx, type) -struct rx *rx; -enum rexp_node_type type; -#endif -{ - struct rexp_node *n; - - n = (struct rexp_node *) malloc(sizeof(*n)); - if (n) { - bzero(n, sizeof(*n)); - n->type = type; - } - return n; -} - - -/* free_rexp_node assumes that the bitset passed to rx_mk_r_cset - * can be freed using rx_free_cset. - */ -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_mk_r_cset(struct rx *rx, rx_Bitset b) -#else -RX_DECL struct rexp_node *rx_mk_r_cset(rx, b) -struct rx *rx; -rx_Bitset b; -#endif -{ - struct rexp_node *n = rexp_node(rx, r_cset); - - if (n) - n->params.cset = b; - return n; -} - -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_mk_r_concat(struct rx *rx, - struct rexp_node *a, - struct rexp_node *b) -#else -RX_DECL struct rexp_node *rx_mk_r_concat(rx, a, b) -struct rx *rx; -struct rexp_node *a; -struct rexp_node *b; -#endif -{ - struct rexp_node *n = rexp_node(rx, r_concat); - - if (n) { - n->params.pair.left = a; - n->params.pair.right = b; - } - return n; -} - - -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_mk_r_alternate(struct rx *rx, - struct rexp_node *a, - struct rexp_node *b) -#else -RX_DECL struct rexp_node *rx_mk_r_alternate(rx, a, b) -struct rx *rx; -struct rexp_node *a; -struct rexp_node *b; -#endif -{ - struct rexp_node *n = rexp_node(rx, r_alternate); - - if (n) { - n->params.pair.left = a; - n->params.pair.right = b; - } - return n; -} - - -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_mk_r_opt(struct rx *rx, struct rexp_node *a) -#else -RX_DECL struct rexp_node *rx_mk_r_opt(rx, a) -struct rx *rx; -struct rexp_node *a; -#endif -{ - struct rexp_node *n = rexp_node(rx, r_opt); - - if (n) { - 
n->params.pair.left = a; - n->params.pair.right = 0; - } - return n; -} - -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_mk_r_star(struct rx *rx, struct rexp_node *a) -#else -RX_DECL struct rexp_node *rx_mk_r_star(rx, a) -struct rx *rx; -struct rexp_node *a; -#endif -{ - struct rexp_node *n = rexp_node(rx, r_star); - - if (n) { - n->params.pair.left = a; - n->params.pair.right = 0; - } - return n; -} - - -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_mk_r_2phase_star(struct rx *rx, - struct rexp_node *a, - struct rexp_node *b) -#else -RX_DECL struct rexp_node *rx_mk_r_2phase_star(rx, a, b) -struct rx *rx; -struct rexp_node *a; -struct rexp_node *b; -#endif -{ - struct rexp_node *n = rexp_node(rx, r_2phase_star); - - if (n) { - n->params.pair.left = a; - n->params.pair.right = b; - } - return n; -} - -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_mk_r_side_effect(struct rx *rx, - rx_side_effect a) -#else -RX_DECL struct rexp_node *rx_mk_r_side_effect(rx, a) -struct rx *rx; -rx_side_effect a; -#endif -{ - struct rexp_node *n = rexp_node(rx, r_side_effect); - - if (n) { - n->params.side_effect = a; - n->params.pair.right = 0; - } - return n; -} - - -#if 0 -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_mk_r_data(struct rx *rx, void *a) -#else -RX_DECL struct rexp_node *rx_mk_r_data(rx, a) -struct rx *rx; -void *a; -#endif -{ - struct rexp_node *n = rexp_node(rx, r_data); - - if (n) { - n->params.pair.left = a; - n->params.pair.right = 0; - } - return n; -} -#endif - -#ifdef __STDC__ -RX_DECL void rx_free_rexp(struct rx *rx, struct rexp_node *node) -#else -RX_DECL void rx_free_rexp(rx, node) -struct rx *rx; -struct rexp_node *node; -#endif -{ - if (node) { - switch (node->type) { - case r_cset: - if (node->params.cset) - rx_free_cset(rx, node->params.cset); - - case r_side_effect: - break; - - case r_concat: - case r_alternate: - case r_2phase_star: - case r_opt: - case r_star: - rx_free_rexp(rx, node->params.pair.left); - rx_free_rexp(rx, node->params.pair.right); - 
break; - - case r_data: - /* This shouldn't occur. */ - break; - } - free((char *) node); - } -} - -#ifdef __STDC__ -RX_DECL struct rexp_node *rx_copy_rexp(struct rx *rx, - struct rexp_node *node) -#else -RX_DECL struct rexp_node *rx_copy_rexp(rx, node) -struct rx *rx; -struct rexp_node *node; -#endif -{ - if (!node) - return 0; - else { - struct rexp_node *n = rexp_node(rx, node->type); - - if (!n) - return 0; - switch (node->type) { - case r_cset: - n->params.cset = rx_copy_cset(rx, node->params.cset); - if (!n->params.cset) { - rx_free_rexp(rx, n); - return 0; - } - break; - - case r_side_effect: - n->params.side_effect = node->params.side_effect; - break; - - case r_concat: - case r_alternate: - case r_opt: - case r_2phase_star: - case r_star: - n->params.pair.left = rx_copy_rexp(rx, node->params.pair.left); - n->params.pair.right = - rx_copy_rexp(rx, node->params.pair.right); - if ((node->params.pair.left && !n->params.pair.left) - || (node->params.pair.right && !n->params.pair.right)) { - rx_free_rexp(rx, n); - return 0; - } - break; - case r_data: - /* shouldn't happen */ - break; - } - return n; - } -} - - - -/* This page: functions to build and destroy graphs that describe nfa's */ - -/* Constructs a new nfa node. */ -#ifdef __STDC__ -RX_DECL struct rx_nfa_state *rx_nfa_state(struct rx *rx) -#else -RX_DECL struct rx_nfa_state *rx_nfa_state(rx) -struct rx *rx; -#endif -{ - struct rx_nfa_state *n = (struct rx_nfa_state *) malloc(sizeof(*n)); - - if (!n) - return 0; - bzero(n, sizeof(*n)); - n->next = rx->nfa_states; - rx->nfa_states = n; - return n; -} - - -#ifdef __STDC__ -RX_DECL void rx_free_nfa_state(struct rx_nfa_state *n) -#else -RX_DECL void rx_free_nfa_state(n) -struct rx_nfa_state *n; -#endif -{ - free((char *) n); -} - - -/* This looks up an nfa node, given a numeric id. Numeric id's are - * assigned after the nfa has been built. 
- */ -#ifdef __STDC__ -RX_DECL struct rx_nfa_state *rx_id_to_nfa_state(struct rx *rx, int id) -#else -RX_DECL struct rx_nfa_state *rx_id_to_nfa_state(rx, id) -struct rx *rx; -int id; -#endif -{ - struct rx_nfa_state *n; - - for (n = rx->nfa_states; n; n = n->next) - if (n->id == id) - return n; - return 0; -} - - -/* This adds an edge between two nodes, but doesn't initialize the - * edge label. - */ - -#ifdef __STDC__ -RX_DECL struct rx_nfa_edge *rx_nfa_edge(struct rx *rx, - enum rx_nfa_etype type, - struct rx_nfa_state *start, - struct rx_nfa_state *dest) -#else -RX_DECL struct rx_nfa_edge *rx_nfa_edge(rx, type, start, dest) -struct rx *rx; -enum rx_nfa_etype type; -struct rx_nfa_state *start; -struct rx_nfa_state *dest; -#endif -{ - struct rx_nfa_edge *e; - - e = (struct rx_nfa_edge *) malloc(sizeof(*e)); - if (!e) - return 0; - e->next = start->edges; - start->edges = e; - e->type = type; - e->dest = dest; - return e; -} - - -#ifdef __STDC__ -RX_DECL void rx_free_nfa_edge(struct rx_nfa_edge *e) -#else -RX_DECL void rx_free_nfa_edge(e) -struct rx_nfa_edge *e; -#endif -{ - free((char *) e); -} - - -/* This constructs a POSSIBLE_FUTURE, which is a kind epsilon-closure - * of an NFA. These are added to an nfa automaticly by eclose_nfa. 
- */ - -#ifdef __STDC__ -static struct rx_possible_future *rx_possible_future(struct rx *rx, struct rx_se_list - *effects) -#else -static struct rx_possible_future *rx_possible_future(rx, effects) -struct rx *rx; -struct rx_se_list *effects; -#endif -{ - struct rx_possible_future *ec; - - ec = (struct rx_possible_future *) malloc(sizeof(*ec)); - if (!ec) - return 0; - ec->destset = 0; - ec->next = 0; - ec->effects = effects; - return ec; -} - - -#ifdef __STDC__ -static void rx_free_possible_future(struct rx_possible_future *pf) -#else -static void rx_free_possible_future(pf) -struct rx_possible_future *pf; -#endif -{ - free((char *) pf); -} - - -#ifdef __STDC__ -RX_DECL void rx_free_nfa(struct rx *rx) -#else -RX_DECL void rx_free_nfa(rx) -struct rx *rx; -#endif -{ - while (rx->nfa_states) { - while (rx->nfa_states->edges) { - switch (rx->nfa_states->edges->type) { - case ne_cset: - rx_free_cset(rx, rx->nfa_states->edges->params.cset); - break; - default: - break; - } - { - struct rx_nfa_edge *e; - - e = rx->nfa_states->edges; - rx->nfa_states->edges = rx->nfa_states->edges->next; - rx_free_nfa_edge(e); - } - } /* while (rx->nfa_states->edges) */ - { - /* Iterate over the partial epsilon closures of rx->nfa_states */ - struct rx_possible_future *pf = rx->nfa_states->futures; - - while (pf) { - struct rx_possible_future *pft = pf; - - pf = pf->next; - rx_free_possible_future(pft); - } - } - { - struct rx_nfa_state *n; - - n = rx->nfa_states; - rx->nfa_states = rx->nfa_states->next; - rx_free_nfa_state(n); - } - } -} - - - -/* This page: translating a pattern expression into an nfa and doing the - * static part of the nfa->super-nfa translation. - */ - -/* This is the thompson regexp->nfa algorithm. - * It is modified to allow for `side-effect epsilons.' Those are - * edges that are taken whenever a similar epsilon edge would be, - * but which imply that some side effect occurs when the edge - * is taken. 
- * - * Side effects are used to model parts of the pattern langauge - * that are not regular (in the formal sense). - */ - -#ifdef __STDC__ -RX_DECL int -rx_build_nfa(struct rx *rx, - struct rexp_node *rexp, - struct rx_nfa_state **start, struct rx_nfa_state **end) -#else -RX_DECL int rx_build_nfa(rx, rexp, start, end) -struct rx *rx; -struct rexp_node *rexp; -struct rx_nfa_state **start; -struct rx_nfa_state **end; -#endif -{ - struct rx_nfa_edge *edge; - - /* Start & end nodes may have been allocated by the caller. */ - *start = *start ? *start : rx_nfa_state(rx); - - if (!*start) - return 0; - - if (!rexp) { - *end = *start; - return 1; - } - - *end = *end ? *end : rx_nfa_state(rx); - - if (!*end) { - rx_free_nfa_state(*start); - return 0; - } - - switch (rexp->type) { - case r_data: - return 0; - - case r_cset: - edge = rx_nfa_edge(rx, ne_cset, *start, *end); - if (!edge) - return 0; - edge->params.cset = rx_copy_cset(rx, rexp->params.cset); - if (!edge->params.cset) { - rx_free_nfa_edge(edge); - return 0; - } - return 1; - - case r_opt: - return (rx_build_nfa(rx, rexp->params.pair.left, start, end) - && rx_nfa_edge(rx, ne_epsilon, *start, *end)); - - case r_star: - { - struct rx_nfa_state *star_start = 0; - struct rx_nfa_state *star_end = 0; - - return (rx_build_nfa(rx, rexp->params.pair.left, - &star_start, &star_end) - && star_start - && star_end - && rx_nfa_edge(rx, ne_epsilon, star_start, star_end) - && rx_nfa_edge(rx, ne_epsilon, *start, star_start) - && rx_nfa_edge(rx, ne_epsilon, star_end, *end) - - && rx_nfa_edge(rx, ne_epsilon, star_end, star_start)); - } - - case r_2phase_star: - { - struct rx_nfa_state *star_start = 0; - struct rx_nfa_state *star_end = 0; - struct rx_nfa_state *loop_exp_start = 0; - struct rx_nfa_state *loop_exp_end = 0; - - return (rx_build_nfa(rx, rexp->params.pair.left, - &star_start, &star_end) - && rx_build_nfa(rx, rexp->params.pair.right, - &loop_exp_start, &loop_exp_end) - && star_start - && star_end - && loop_exp_end - && 
loop_exp_start - && rx_nfa_edge(rx, ne_epsilon, star_start, *end) - && rx_nfa_edge(rx, ne_epsilon, *start, star_start) - && rx_nfa_edge(rx, ne_epsilon, star_end, *end) - - && rx_nfa_edge(rx, ne_epsilon, star_end, loop_exp_start) - && rx_nfa_edge(rx, ne_epsilon, loop_exp_end, star_start)); - } - - - case r_concat: - { - struct rx_nfa_state *shared = 0; - - return (rx_build_nfa(rx, rexp->params.pair.left, start, &shared) - && rx_build_nfa(rx, rexp->params.pair.right, &shared, - end)); - } - - case r_alternate: - { - struct rx_nfa_state *ls = 0; - struct rx_nfa_state *le = 0; - struct rx_nfa_state *rs = 0; - struct rx_nfa_state *re = 0; - - return (rx_build_nfa(rx, rexp->params.pair.left, &ls, &le) - && rx_build_nfa(rx, rexp->params.pair.right, &rs, &re) - && rx_nfa_edge(rx, ne_epsilon, *start, ls) - && rx_nfa_edge(rx, ne_epsilon, *start, rs) - && rx_nfa_edge(rx, ne_epsilon, le, *end) - && rx_nfa_edge(rx, ne_epsilon, re, *end)); - } - - case r_side_effect: - edge = rx_nfa_edge(rx, ne_side_effect, *start, *end); - if (!edge) - return 0; - edge->params.side_effect = rexp->params.side_effect; - return 1; - } - - /* this should never happen */ - return 0; -} - - -/* RX_NAME_NFA_STATES identifies all nodes with outgoing non-epsilon - * transitions. Only these nodes can occur in super-states. - * All nodes are given an integer id. - * The id is non-negative if the node has non-epsilon out-transitions, negative - * otherwise (this is because we want the non-negative ids to be used as - * array indexes in a few places). 
- */ - -#ifdef __STDC__ -RX_DECL void rx_name_nfa_states(struct rx *rx) -#else -RX_DECL void rx_name_nfa_states(rx) -struct rx *rx; -#endif -{ - struct rx_nfa_state *n = rx->nfa_states; - - rx->nodec = 0; - rx->epsnodec = -1; - - while (n) { - struct rx_nfa_edge *e = n->edges; - - if (n->is_start) - n->eclosure_needed = 1; - - while (e) { - switch (e->type) { - case ne_epsilon: - case ne_side_effect: - break; - - case ne_cset: - n->id = rx->nodec++; - { - struct rx_nfa_edge *from_n = n->edges; - - while (from_n) { - from_n->dest->eclosure_needed = 1; - from_n = from_n->next; - } - } - goto cont; - } - e = e->next; - } - n->id = rx->epsnodec--; - cont: - n = n->next; - } - rx->epsnodec = -rx->epsnodec; -} - - -/* This page: data structures for the static part of the nfa->supernfa - * translation. - * - * There are side effect lists -- lists of side effects occuring - * along an uninterrupted, acyclic path of side-effect epsilon edges. - * Such paths are collapsed to single edges in the course of computing - * epsilon closures. Such single edges are labled with a list of all - * the side effects entailed in crossing them. Like lists of side - * effects are made == by the constructors below. - * - * There are also nfa state sets. These are used to hold a list of all - * states reachable from a starting state for a given type of transition - * and side effect list. These are also hash-consed. - */ - -/* The next several functions compare, construct, etc. lists of side - * effects. See ECLOSE_NFA (below) for details. - */ - -/* Ordering of rx_se_list - * (-1, 0, 1 return value convention). - */ - -#ifdef __STDC__ -static int se_list_cmp(void *va, void *vb) -#else -static int se_list_cmp(va, vb) -void *va; -void *vb; -#endif -{ - struct rx_se_list *a = (struct rx_se_list *) va; - struct rx_se_list *b = (struct rx_se_list *) vb; - - return ((va == vb) - ? 0 - : (!va - ? -1 - : (!vb - ? 1 - : ((long) a->car < (long) b->car - ? 1 - : ((long) a->car > (long) b->car - ? 
-1 - : se_list_cmp((void *) a->cdr, - (void *) b->cdr)))))); -} - - -#ifdef __STDC__ -static int se_list_equal(void *va, void *vb) -#else -static int se_list_equal(va, vb) -void *va; -void *vb; -#endif -{ - return !(se_list_cmp(va, vb)); -} - -static struct rx_hash_rules se_list_hash_rules = { - se_list_equal, - compiler_hash_alloc, - compiler_free_hash, - compiler_hash_item_alloc, - compiler_free_hash_item -}; - - -#ifdef __STDC__ -static struct rx_se_list *side_effect_cons(struct rx *rx, - void *se, - struct rx_se_list *list) -#else -static struct rx_se_list *side_effect_cons(rx, se, list) -struct rx *rx; -void *se; -struct rx_se_list *list; -#endif -{ - struct rx_se_list *l; - - l = ((struct rx_se_list *) malloc(sizeof(*l))); - if (!l) - return 0; - l->car = se; - l->cdr = list; - return l; -} - - -#ifdef __STDC__ -static struct rx_se_list *hash_cons_se_prog(struct rx *rx, - struct rx_hash *memo, - void *car, - struct rx_se_list *cdr) -#else -static struct rx_se_list *hash_cons_se_prog(rx, memo, car, cdr) -struct rx *rx; -struct rx_hash *memo; -void *car; -struct rx_se_list *cdr; -#endif -{ - long hash = (long) car ^ (long) cdr; - struct rx_se_list template; - - template.car = car; - template.cdr = cdr; - { - struct rx_hash_item *it = rx_hash_store(memo, hash, - (void *) &template, - &se_list_hash_rules); - - if (!it) - return 0; - if (it->data == (void *) &template) { - struct rx_se_list *consed; - - consed = (struct rx_se_list *) malloc(sizeof(*consed)); - if (!consed) { - free((char *) it); - return 0; - } - *consed = template; - it->data = (void *) consed; - } - return (struct rx_se_list *) it->data; - } -} - - -#ifdef __STDC__ -static struct rx_se_list *hash_se_prog(struct rx *rx, struct rx_hash *memo, - struct rx_se_list *prog) -#else -static struct rx_se_list *hash_se_prog(rx, memo, prog) -struct rx *rx; -struct rx_hash *memo; -struct rx_se_list *prog; -#endif -{ - struct rx_se_list *answer = 0; - - while (prog) { - answer = hash_cons_se_prog(rx, memo, 
prog->car, answer); - if (!answer) - return 0; - prog = prog->cdr; - } - return answer; -} - -#ifdef __STDC__ -static int nfa_set_cmp(void *va, void *vb) -#else -static int nfa_set_cmp(va, vb) -void *va; -void *vb; -#endif -{ - struct rx_nfa_state_set *a = (struct rx_nfa_state_set *) va; - struct rx_nfa_state_set *b = (struct rx_nfa_state_set *) vb; - - return ((va == vb) - ? 0 - : (!va - ? -1 - : (!vb - ? 1 - : (a->car->id < b->car->id - ? 1 - : (a->car->id > b->car->id - ? -1 - : nfa_set_cmp((void *) a->cdr, - (void *) b->cdr)))))); -} - -#ifdef __STDC__ -static int nfa_set_equal(void *va, void *vb) -#else -static int nfa_set_equal(va, vb) -void *va; -void *vb; -#endif -{ - return !nfa_set_cmp(va, vb); -} - -static struct rx_hash_rules nfa_set_hash_rules = { - nfa_set_equal, - compiler_hash_alloc, - compiler_free_hash, - compiler_hash_item_alloc, - compiler_free_hash_item -}; - - -#ifdef __STDC__ -static struct rx_nfa_state_set *nfa_set_cons(struct rx *rx, - struct rx_hash *memo, - struct rx_nfa_state *state, - struct rx_nfa_state_set *set) -#else -static struct rx_nfa_state_set *nfa_set_cons(rx, memo, state, set) -struct rx *rx; -struct rx_hash *memo; -struct rx_nfa_state *state; -struct rx_nfa_state_set *set; -#endif -{ - struct rx_nfa_state_set template; - struct rx_hash_item *node; - - template.car = state; - template.cdr = set; - node = rx_hash_store(memo, - (((long) state) >> 8) ^ (long) set, - &template, &nfa_set_hash_rules); - if (!node) - return 0; - if (node->data == &template) { - struct rx_nfa_state_set *l; - - l = (struct rx_nfa_state_set *) malloc(sizeof(*l)); - node->data = (void *) l; - if (!l) - return 0; - *l = template; - } - return (struct rx_nfa_state_set *) node->data; -} - -#ifdef __STDC__ -static struct rx_nfa_state_set *nfa_set_enjoin(struct rx *rx, - struct rx_hash *memo, - struct rx_nfa_state *state, - struct rx_nfa_state_set - *set) -#else -static struct rx_nfa_state_set *nfa_set_enjoin(rx, memo, state, set) -struct rx *rx; -struct 
rx_hash *memo; -struct rx_nfa_state *state; -struct rx_nfa_state_set *set; -#endif -{ - if (!set || state->id < set->car->id) - return nfa_set_cons(rx, memo, state, set); - if (state->id == set->car->id) - return set; - else { - struct rx_nfa_state_set *newcdr - - = nfa_set_enjoin(rx, memo, state, set->cdr); - if (newcdr != set->cdr) - set = nfa_set_cons(rx, memo, set->car, newcdr); - return set; - } -} - - - -/* This page: computing epsilon closures. The closures aren't total. - * Each node's closures are partitioned according to the side effects entailed - * along the epsilon edges. Return true on success. - */ - -struct eclose_frame { - struct rx_se_list *prog_backwards; -}; -static int eclose_node(struct rx *, struct rx_nfa_state *, - struct rx_nfa_state *, struct eclose_frame *); -RX_DECL int rx_eclose_nfa(struct rx *); -RX_DECL void rx_delete_epsilon_transitions(struct rx *); -static int nfacmp(void *, void *); -static int count_hash_nodes(struct rx_hash *); -static void nfa_set_freer(struct rx_hash_item *); -RX_DECL int rx_compactify_nfa(struct rx *, void **, unsigned long *); -static char *rx_cache_malloc(struct rx_cache *, int); -static void rx_cache_free(struct rx_cache *, - - struct rx_freelist **, char *); -static void install_transition(struct rx_superstate *, - - struct rx_inx *, rx_Bitset); -static int qlen(struct rx_superstate *); -static void check_cache(struct rx_cache *); -static void semifree_superstate(struct rx_cache *); -static void refresh_semifree_superstate - - (struct rx_cache *, struct rx_superstate *); -static void rx_refresh_this_superstate - - (struct rx_cache *, struct rx_superstate *); -static void release_superset_low(struct rx_cache *, struct rx_superset *); -RX_DECL void rx_release_superset(struct rx *, struct rx_superset *); -static int rx_really_free_superstate(struct rx_cache *); -static char *rx_cache_get(struct rx_cache *, struct rx_freelist **); -static char *rx_cache_malloc_or_get(struct rx_cache *, - struct rx_freelist 
**, int); -static char *rx_cache_get_superstate(struct rx_cache *); -static int supersetcmp(void *, void *); -static struct rx_hash_item -*superset_allocator(struct rx_hash_rules *, void *); -static struct rx_hash -*super_hash_allocator(struct rx_hash_rules *); -static void super_hash_liberator(struct rx_hash *, struct rx_hash_rules *); -static void superset_hash_item_liberator - - (struct rx_hash_item *, struct rx_hash_rules *); -static int bytes_for_cache_size(int, int); -static void rx_morecore(struct rx_cache *); -RX_DECL struct rx_superset -*rx_superset_cons(struct rx *, struct rx_nfa_state *, - - struct rx_superset *); -RX_DECL struct rx_superset -*rx_superstate_eclosure_union - - (struct rx *, struct rx_superset *, struct rx_nfa_state_set *); -static struct rx_distinct_future -*include_futures(struct rx *, - struct rx_distinct_future *, - - struct rx_nfa_state *, struct rx_superstate *); -RX_DECL struct rx_superstate -*rx_superstate(struct rx *, struct rx_superset *); -static int solve_destination(struct rx *, struct rx_distinct_future *); -static int compute_super_edge(struct rx *, - struct rx_distinct_future **, - - rx_Bitset, struct rx_superstate *, - unsigned char); -static struct rx_super_edge -*rx_super_edge(struct rx *, struct rx_superstate *, - - rx_Bitset, struct rx_distinct_future *); -static void install_partial_transition - (struct rx_superstate *, struct rx_inx *, RX_subset, int); -RX_DECL struct rx_inx -*rx_handle_cache_miss(struct rx *, struct rx_superstate *, - - unsigned char, void *); -static boolean - -at_begline_loc_p(__const__ char *, __const__ char *, reg_syntax_t); -static boolean at_endline_loc_p(__const__ char *, __const__ char *, int); -static rx_Bitset -inverse_translation(struct re_pattern_buffer *, char *, - rx_Bitset, unsigned char *, int); - - -#ifdef __STDC__ -static int -eclose_node(struct rx *rx, struct rx_nfa_state *outnode, - struct rx_nfa_state *node, struct eclose_frame *frame) -#else -static int eclose_node(rx, outnode, 
node, frame) -struct rx *rx; -struct rx_nfa_state *outnode; -struct rx_nfa_state *node; -struct eclose_frame *frame; -#endif -{ - struct rx_nfa_edge *e = node->edges; - - /* For each node, we follow all epsilon paths to build the closure. - * The closure omits nodes that have only epsilon edges. - * The closure is split into partial closures -- all the states in - * a partial closure are reached by crossing the same list of - * of side effects (though not necessarily the same path). - */ - if (node->mark) - return 1; - node->mark = 1; - - if (node->id >= 0 || node->is_final) { - struct rx_possible_future **ec; - struct rx_se_list *prog_in_order - = ((struct rx_se_list *) hash_se_prog(rx, - &rx->se_list_memo, - frame->prog_backwards)); - int cmp; - - ec = &outnode->futures; - - while (*ec) { - cmp = - se_list_cmp((void *) (*ec)->effects, - (void *) prog_in_order); - if (cmp <= 0) - break; - ec = &(*ec)->next; - } - if (!*ec || (cmp < 0)) { - struct rx_possible_future *saved = *ec; - - *ec = rx_possible_future(rx, prog_in_order); - (*ec)->next = saved; - if (!*ec) - return 0; - } - if (node->id >= 0) { - (*ec)->destset = nfa_set_enjoin(rx, &rx->set_list_memo, - node, (*ec)->destset); - if (!(*ec)->destset) - return 0; - } - } - - while (e) { - switch (e->type) { - case ne_epsilon: - if (!eclose_node(rx, outnode, e->dest, frame)) - return 0; - break; - case ne_side_effect: - { - frame->prog_backwards = side_effect_cons(rx, - e->params.side_effect, - frame->prog_backwards); - if (!frame->prog_backwards) - return 0; - if (!eclose_node(rx, outnode, e->dest, frame)) - return 0; - { - struct rx_se_list *dying = frame->prog_backwards; - - frame->prog_backwards = frame->prog_backwards->cdr; - free((char *) dying); - } - break; - } - default: - break; - } - e = e->next; - } - node->mark = 0; - return 1; -} - -#ifdef __STDC__ -RX_DECL int rx_eclose_nfa(struct rx *rx) -#else -RX_DECL int rx_eclose_nfa(rx) -struct rx *rx; -#endif -{ - struct rx_nfa_state *n = rx->nfa_states; - 
struct eclose_frame frame; - static int rx_id = 0; - - frame.prog_backwards = 0; - rx->rx_id = rx_id++; - bzero(&rx->se_list_memo, sizeof(rx->se_list_memo)); - bzero(&rx->set_list_memo, sizeof(rx->set_list_memo)); - while (n) { - n->futures = 0; - if (n->eclosure_needed && !eclose_node(rx, n, n, &frame)) - return 0; - /* clear_marks (rx); */ - n = n->next; - } - return 1; -} - - -/* This deletes epsilon edges from an NFA. After running eclose_node, - * we have no more need for these edges. They are removed to simplify - * further operations on the NFA. - */ - -#ifdef __STDC__ -RX_DECL void rx_delete_epsilon_transitions(struct rx *rx) -#else -RX_DECL void rx_delete_epsilon_transitions(rx) -struct rx *rx; -#endif -{ - struct rx_nfa_state *n = rx->nfa_states; - struct rx_nfa_edge **e; - - while (n) { - e = &n->edges; - while (*e) { - struct rx_nfa_edge *t; - - switch ((*e)->type) { - case ne_epsilon: - case ne_side_effect: - t = *e; - *e = t->next; - rx_free_nfa_edge(t); - break; - - default: - e = &(*e)->next; - break; - } - } - n = n->next; - } -} - - -/* This page: storing the nfa in a contiguous region of memory for - * subsequent conversion to a super-nfa. - */ - -/* This is for qsort on an array of nfa_states. The order - * is based on state ids and goes - * [0...MAX][MIN..-1] where (MAX>=0) and (MIN<0) - * This way, positive ids double as array indices. - */ - -#ifdef __STDC__ -static int nfacmp(void *va, void *vb) -#else -static int nfacmp(va, vb) -void *va; -void *vb; -#endif -{ - struct rx_nfa_state **a = (struct rx_nfa_state **) va; - struct rx_nfa_state **b = (struct rx_nfa_state **) vb; - - return (*a == *b /* &&&& 3.18 */ - ? 0 : (((*a)->id < 0) == ((*b)->id < 0) - ? (((*a)->id < (*b)->id) ? -1 : 1) - : (((*a)->id < 0) - ? 
1 : -1))); -} - -#ifdef __STDC__ -static int count_hash_nodes(struct rx_hash *st) -#else -static int count_hash_nodes(st) -struct rx_hash *st; -#endif -{ - int x; - int count = 0; - - for (x = 0; x < 13; ++x) - count += ((st->children[x]) - ? count_hash_nodes(st->children[x]) - : st->bucket_size[x]); - - return count; -} - - -#ifdef __STDC__ -static void se_memo_freer(struct rx_hash_item *node) -#else -static void se_memo_freer(node) -struct rx_hash_item *node; -#endif -{ - free((char *) node->data); -} - - -#ifdef __STDC__ -static void nfa_set_freer(struct rx_hash_item *node) -#else -static void nfa_set_freer(node) -struct rx_hash_item *node; -#endif -{ - free((char *) node->data); -} - - -/* This copies an entire NFA into a single malloced block of memory. - * Mostly this is for compatability with regex.c, though it is convenient - * to have the nfa nodes in an array. - */ - -#ifdef __STDC__ -RX_DECL int -rx_compactify_nfa(struct rx *rx, void **mem, unsigned long *size) -#else -RX_DECL int rx_compactify_nfa(rx, mem, size) -struct rx *rx; -void **mem; -unsigned long *size; -#endif -{ - int total_nodec; - struct rx_nfa_state *n; - int edgec = 0; - int eclosec = 0; - int se_list_consc = count_hash_nodes(&rx->se_list_memo); - int nfa_setc = count_hash_nodes(&rx->set_list_memo); - unsigned long total_size; - - /* This takes place in two stages. First, the total size of the - * nfa is computed, then structures are copied. 
- */ - n = rx->nfa_states; - total_nodec = 0; - while (n) { - struct rx_nfa_edge *e = n->edges; - struct rx_possible_future *ec = n->futures; - - ++total_nodec; - while (e) { - ++edgec; - e = e->next; - } - while (ec) { - ++eclosec; - ec = ec->next; - } - n = n->next; - } - - total_size = (total_nodec * sizeof(struct rx_nfa_state) - + edgec * rx_sizeof_bitset(rx->local_cset_size) - + edgec * sizeof(struct rx_nfa_edge) - + nfa_setc * sizeof(struct rx_nfa_state_set) - + eclosec * sizeof(struct rx_possible_future) - + se_list_consc * sizeof(struct rx_se_list) - + rx->reserved); - - if (total_size > *size) { - *mem = remalloc(*mem, total_size); - if (*mem) - *size = total_size; - else - return 0; - } - /* Now we've allocated the memory; this copies the NFA. */ - { - static struct rx_nfa_state **scratch = 0; - static int scratch_alloc = 0; - struct rx_nfa_state *state_base = (struct rx_nfa_state *) *mem; - struct rx_nfa_state *new_state = state_base; - struct rx_nfa_edge *new_edge = (struct rx_nfa_edge *) - ((char *) state_base + total_nodec * sizeof(struct rx_nfa_state)); - struct rx_se_list *new_se_list = (struct rx_se_list *) - ((char *) new_edge + edgec * sizeof(struct rx_nfa_edge)); - struct rx_possible_future *new_close = - ((struct rx_possible_future *) - ((char *) new_se_list - - + se_list_consc * sizeof(struct rx_se_list))); - struct rx_nfa_state_set *new_nfa_set = ((struct rx_nfa_state_set *) - - ((char *) new_close + - eclosec * - - sizeof(struct - rx_possible_future))); - char *new_bitset = - - ((char *) new_nfa_set + - nfa_setc * sizeof(struct rx_nfa_state_set)); - int x; - struct rx_nfa_state *n; - - if (scratch_alloc < total_nodec) { - scratch = ((struct rx_nfa_state **) - remalloc(scratch, total_nodec * sizeof(*scratch))); - if (scratch) - scratch_alloc = total_nodec; - else { - scratch_alloc = 0; - return 0; - } - } - - for (x = 0, n = rx->nfa_states; n; n = n->next) - scratch[x++] = n; - - qsort(scratch, total_nodec, sizeof(struct rx_nfa_state *), - 
(__compar_fn_t) nfacmp); - - for (x = 0; x < total_nodec; ++x) { - struct rx_possible_future *eclose = scratch[x]->futures; - struct rx_nfa_edge *edge = scratch[x]->edges; - struct rx_nfa_state *cn = new_state++; - - cn->futures = 0; - cn->edges = 0; - cn->next = (x == total_nodec - 1) ? 0 : (cn + 1); - cn->id = scratch[x]->id; - cn->is_final = scratch[x]->is_final; - cn->is_start = scratch[x]->is_start; - cn->mark = 0; - while (edge) { - int indx = (edge->dest->id < 0 - ? (total_nodec + edge->dest->id) - - : edge->dest->id); - struct rx_nfa_edge *e = new_edge++; - rx_Bitset cset = (rx_Bitset) new_bitset; - - new_bitset += rx_sizeof_bitset(rx->local_cset_size); - rx_bitset_null(rx->local_cset_size, cset); - rx_bitset_union(rx->local_cset_size, cset, - edge->params.cset); - e->next = cn->edges; - cn->edges = e; - e->type = edge->type; - e->dest = state_base + indx; - e->params.cset = cset; - edge = edge->next; - } - while (eclose) { - struct rx_possible_future *ec = new_close++; - struct rx_hash_item *sp; - struct rx_se_list **sepos; - struct rx_se_list *sesrc; - struct rx_nfa_state_set *destlst; - struct rx_nfa_state_set **destpos; - - ec->next = cn->futures; - cn->futures = ec; - for (sepos = &ec->effects, sesrc = eclose->effects; - sesrc; sesrc = sesrc->cdr, sepos = &(*sepos)->cdr) { - sp = rx_hash_find(&rx->se_list_memo, - (long) sesrc-> - car ^ (long) sesrc->cdr, sesrc, - &se_list_hash_rules); - if (sp->binding) { - sesrc = (struct rx_se_list *) sp->binding; - break; - } - *new_se_list = *sesrc; - sp->binding = (void *) new_se_list; - *sepos = new_se_list; - ++new_se_list; - } - *sepos = sesrc; - for (destpos = &ec->destset, destlst = eclose->destset; - destlst; - destpos = &(*destpos)->cdr, destlst = destlst->cdr) { - sp = rx_hash_find(&rx->set_list_memo, - ((((long) destlst->car) >> 8) - ^ (long) destlst->cdr), - destlst, &nfa_set_hash_rules); - if (sp->binding) { - destlst = (struct rx_nfa_state_set *) sp->binding; - break; - } - *new_nfa_set = *destlst; - 
new_nfa_set->car = state_base + destlst->car->id; - sp->binding = (void *) new_nfa_set; - *destpos = new_nfa_set; - ++new_nfa_set; - } - *destpos = destlst; - eclose = eclose->next; - } - } - } - rx_free_hash_table(&rx->se_list_memo, se_memo_freer, - &se_list_hash_rules); - bzero(&rx->se_list_memo, sizeof(rx->se_list_memo)); - rx_free_hash_table(&rx->set_list_memo, nfa_set_freer, - &nfa_set_hash_rules); - bzero(&rx->set_list_memo, sizeof(rx->set_list_memo)); - - rx_free_nfa(rx); - rx->nfa_states = (struct rx_nfa_state *) *mem; - return 1; -} - - -/* The functions in the next several pages define the lazy-NFA-conversion used - * by matchers. The input to this construction is an NFA such as - * is built by compactify_nfa (rx.c). The output is the superNFA. - */ - -/* Match engines can use arbitrary values for opcodes. So, the parse tree - * is built using instructions names (enum rx_opcode), but the superstate - * nfa is populated with mystery opcodes (void *). - * - * For convenience, here is an id table. The opcodes are == to their inxs - * - * The lables in re_search_2 would make good values for instructions. - */ - -void *rx_id_instruction_table[rx_num_instructions] = { - (void *) rx_backtrack_point, - (void *) rx_do_side_effects, - (void *) rx_cache_miss, - (void *) rx_next_char, - (void *) rx_backtrack, - (void *) rx_error_inx -}; - - - -/* Memory mgt. for superstate graphs. 
*/ - -#ifdef __STDC__ -static char *rx_cache_malloc(struct rx_cache *cache, int bytes) -#else -static char *rx_cache_malloc(cache, bytes) -struct rx_cache *cache; -int bytes; -#endif -{ - while (cache->bytes_left < bytes) { - if (cache->memory_pos) - cache->memory_pos = cache->memory_pos->next; - if (!cache->memory_pos) { - cache->morecore(cache); - if (!cache->memory_pos) - return 0; - } - cache->bytes_left = cache->memory_pos->bytes; - cache->memory_addr = ((char *) cache->memory_pos - - + sizeof(struct rx_blocklist)); - } - cache->bytes_left -= bytes; - { - char *addr = cache->memory_addr; - - cache->memory_addr += bytes; - return addr; - } -} - -#ifdef __STDC__ -static void -rx_cache_free(struct rx_cache *cache, - struct rx_freelist **freelist, char *mem) -#else -static void rx_cache_free(cache, freelist, mem) -struct rx_cache *cache; -struct rx_freelist **freelist; -char *mem; -#endif -{ - struct rx_freelist *it = (struct rx_freelist *) mem; - - it->next = *freelist; - *freelist = it; -} - -/* The partially instantiated superstate graph has a transition - * table at every node. There is one entry for every character. - * This fills in the transition for a set. 
- */ -#ifdef __STDC__ -static void -install_transition(struct rx_superstate *super, - struct rx_inx *answer, rx_Bitset trcset) -#else -static void install_transition(super, answer, trcset) -struct rx_superstate *super; -struct rx_inx *answer; -rx_Bitset trcset; -#endif -{ - struct rx_inx *transitions = super->transitions; - int chr; - - for (chr = 0; chr < 256;) - if (!*trcset) { - ++trcset; - chr += 32; - } else { - RX_subset sub = *trcset; - RX_subset mask = 1; - int bound = chr + 32; - - while (chr < bound) { - if (sub & mask) - transitions[chr] = *answer; - ++chr; - mask <<= 1; - } - ++trcset; - } -} - -#ifdef __STDC__ -static int qlen(struct rx_superstate *q) -#else -static int qlen(q) -struct rx_superstate *q; -#endif -{ - int count = 1; - struct rx_superstate *it; - - if (!q) - return 0; - for (it = q->next_recyclable; it != q; it = it->next_recyclable) - ++count; - return count; -} - -#ifdef __STDC__ -static void check_cache(struct rx_cache *cache) -#else -static void check_cache(cache) -struct rx_cache *cache; -#endif -{ - struct rx_cache *you_fucked_up = 0; - int total = cache->superstates; - int semi = cache->semifree_superstates; - - if (semi != qlen(cache->semifree_superstate)) - check_cache(you_fucked_up); - if ((total - semi) != qlen(cache->lru_superstate)) - check_cache(you_fucked_up); -} - -/* When a superstate is old and neglected, it can enter a - * semi-free state. A semi-free state is slated to die. - * Incoming transitions to a semi-free state are re-written - * to cause an (interpreted) fault when they are taken. - * The fault handler revives the semi-free state, patches - * incoming transitions back to normal, and continues. - * - * The idea is basicly to free in two stages, aborting - * between the two if the state turns out to be useful again. - * When a free is aborted, the rescued superstate is placed - * in the most-favored slot to maximize the time until it - * is next semi-freed. 
- */ - -#ifdef __STDC__ -static void semifree_superstate(struct rx_cache *cache) -#else -static void semifree_superstate(cache) -struct rx_cache *cache; -#endif -{ - int disqualified = cache->semifree_superstates; - - if (disqualified == cache->superstates) - return; - while (cache->lru_superstate->locks) { - cache->lru_superstate = cache->lru_superstate->next_recyclable; - ++disqualified; - if (disqualified == cache->superstates) - return; - } - { - struct rx_superstate *it = cache->lru_superstate; - - it->next_recyclable->prev_recyclable = it->prev_recyclable; - it->prev_recyclable->next_recyclable = it->next_recyclable; - cache->lru_superstate = (it == it->next_recyclable - ? 0 : it->next_recyclable); - if (!cache->semifree_superstate) { - cache->semifree_superstate = it; - it->next_recyclable = it; - it->prev_recyclable = it; - } else { - it->prev_recyclable = - cache->semifree_superstate->prev_recyclable; - it->next_recyclable = cache->semifree_superstate; - it->prev_recyclable->next_recyclable = it; - it->next_recyclable->prev_recyclable = it; - } - { - struct rx_distinct_future *df; - - it->is_semifree = 1; - ++cache->semifree_superstates; - df = it->transition_refs; - if (df) { - df->prev_same_dest->next_same_dest = 0; - for (df = it->transition_refs; df; df = df->next_same_dest) { - df->future_frame.inx = - cache->instruction_table[rx_cache_miss]; - df->future_frame.data = 0; - df->future_frame.data_2 = (void *) df; - /* If there are any NEXT-CHAR instruction frames that - * refer to this state, we convert them to CACHE-MISS frames. 
- */ - if (!df->effects - && (df->edge->options->next_same_super_edge[0] - == df->edge->options)) - install_transition(df->present, &df->future_frame, - df->edge->cset); - } - df = it->transition_refs; - df->prev_same_dest->next_same_dest = df; - } - } - } -} - -#ifdef __STDC__ -static void -refresh_semifree_superstate(struct rx_cache *cache, - struct rx_superstate *super) -#else -static void refresh_semifree_superstate(cache, super) -struct rx_cache *cache; -struct rx_superstate *super; -#endif -{ - struct rx_distinct_future *df; - - if (super->transition_refs) { - super->transition_refs->prev_same_dest->next_same_dest = 0; - for (df = super->transition_refs; df; df = df->next_same_dest) { - df->future_frame.inx = cache->instruction_table[rx_next_char]; - df->future_frame.data = (void *) super->transitions; - /* CACHE-MISS instruction frames that refer to this state, - * must be converted to NEXT-CHAR frames. - */ - if (!df->effects && (df->edge->options->next_same_super_edge[0] - == df->edge->options)) - install_transition(df->present, &df->future_frame, - df->edge->cset); - } - super->transition_refs->prev_same_dest->next_same_dest - = super->transition_refs; - } - if (cache->semifree_superstate == super) - cache->semifree_superstate = (super->prev_recyclable == super - ? 
0 : super->prev_recyclable); - super->next_recyclable->prev_recyclable = super->prev_recyclable; - super->prev_recyclable->next_recyclable = super->next_recyclable; - - if (!cache->lru_superstate) - (cache->lru_superstate - = super->next_recyclable = super->prev_recyclable = super); - else { - super->next_recyclable = cache->lru_superstate; - super->prev_recyclable = cache->lru_superstate->prev_recyclable; - super->next_recyclable->prev_recyclable = super; - super->prev_recyclable->next_recyclable = super; - } - super->is_semifree = 0; - --cache->semifree_superstates; -} - -#ifdef __STDC__ -static void -rx_refresh_this_superstate(struct rx_cache *cache, - struct rx_superstate *superstate) -#else -static void rx_refresh_this_superstate(cache, superstate) -struct rx_cache *cache; -struct rx_superstate *superstate; -#endif -{ - if (superstate->is_semifree) - refresh_semifree_superstate(cache, superstate); - else if (cache->lru_superstate == superstate) - cache->lru_superstate = superstate->next_recyclable; - else if (superstate != cache->lru_superstate->prev_recyclable) { - superstate->next_recyclable->prev_recyclable - = superstate->prev_recyclable; - superstate->prev_recyclable->next_recyclable - = superstate->next_recyclable; - superstate->next_recyclable = cache->lru_superstate; - superstate->prev_recyclable = - cache->lru_superstate->prev_recyclable; - superstate->next_recyclable->prev_recyclable = superstate; - superstate->prev_recyclable->next_recyclable = superstate; - } -} - -#ifdef __STDC__ -static void -release_superset_low(struct rx_cache *cache, struct rx_superset *set) -#else -static void release_superset_low(cache, set) -struct rx_cache *cache; -struct rx_superset *set; -#endif -{ - if (!--set->refs) { - if (set->cdr) - release_superset_low(cache, set->cdr); - - set->starts_for = 0; - - rx_hash_free - (rx_hash_find - (&cache->superset_table, - (unsigned long) set->car ^ set-> - id ^ (unsigned long) set->cdr, (void *) set, - &cache->superset_hash_rules), 
&cache->superset_hash_rules); - rx_cache_free(cache, &cache->free_supersets, (char *) set); - } -} - -#ifdef __STDC__ -RX_DECL void rx_release_superset(struct rx *rx, struct rx_superset *set) -#else -RX_DECL void rx_release_superset(rx, set) -struct rx *rx; -struct rx_superset *set; -#endif -{ - release_superset_low(rx->cache, set); -} - -/* This tries to add a new superstate to the superstate freelist. - * It might, as a result, free some edge pieces or hash tables. - * If nothing can be freed because too many locks are being held, fail. - */ - -#ifdef __STDC__ -static int rx_really_free_superstate(struct rx_cache *cache) -#else -static int rx_really_free_superstate(cache) -struct rx_cache *cache; -#endif -{ - int locked_superstates = 0; - struct rx_superstate *it; - - if (!cache->superstates) - return 0; - - { - /* This is a total guess. The idea is that we should expect as - * many misses as we've recently experienced. I.e., cache->misses - * should be the same as cache->semifree_superstates. - */ - while ((cache->hits + cache->misses) > cache->superstates_allowed) { - cache->hits >>= 1; - cache->misses >>= 1; - } - if (((cache->hits + cache->misses) * cache->semifree_superstates) - < (cache->superstates * cache->misses)) { - semifree_superstate(cache); - semifree_superstate(cache); - } - } - - while (cache->semifree_superstate && cache->semifree_superstate->locks) { - refresh_semifree_superstate(cache, cache->semifree_superstate); - ++locked_superstates; - if (locked_superstates == cache->superstates) - return 0; - } - - if (cache->semifree_superstate) { - it = cache->semifree_superstate; - it->next_recyclable->prev_recyclable = it->prev_recyclable; - it->prev_recyclable->next_recyclable = it->next_recyclable; - cache->semifree_superstate = ((it == it->next_recyclable) - ? 
0 : it->next_recyclable); - --cache->semifree_superstates; - } else { - while (cache->lru_superstate->locks) { - cache->lru_superstate = cache->lru_superstate->next_recyclable; - ++locked_superstates; - if (locked_superstates == cache->superstates) - return 0; - } - it = cache->lru_superstate; - it->next_recyclable->prev_recyclable = it->prev_recyclable; - it->prev_recyclable->next_recyclable = it->next_recyclable; - cache->lru_superstate = ((it == it->next_recyclable) - ? 0 : it->next_recyclable); - } - - if (it->transition_refs) { - struct rx_distinct_future *df; - - for (df = it->transition_refs, - df->prev_same_dest->next_same_dest = 0; - df; df = df->next_same_dest) { - df->future_frame.inx = cache->instruction_table[rx_cache_miss]; - df->future_frame.data = 0; - df->future_frame.data_2 = (void *) df; - df->future = 0; - } - it->transition_refs->prev_same_dest->next_same_dest = - it->transition_refs; - } - { - struct rx_super_edge *tc = it->edges; - - while (tc) { - struct rx_distinct_future *df; - struct rx_super_edge *tct = tc->next; - - df = tc->options; - df->next_same_super_edge[1]->next_same_super_edge[0] = 0; - while (df) { - struct rx_distinct_future *dft = df; - - df = df->next_same_super_edge[0]; - - - if (dft->future && dft->future->transition_refs == dft) { - dft->future->transition_refs = dft->next_same_dest; - if (dft->future->transition_refs == dft) - dft->future->transition_refs = 0; - } - dft->next_same_dest->prev_same_dest = dft->prev_same_dest; - dft->prev_same_dest->next_same_dest = dft->next_same_dest; - rx_cache_free(cache, &cache->free_discernable_futures, - (char *) dft); - } - rx_cache_free(cache, &cache->free_transition_classes, - (char *) tc); - tc = tct; - } - } - - if (it->contents->superstate == it) - it->contents->superstate = 0; - release_superset_low(cache, it->contents); - rx_cache_free(cache, &cache->free_superstates, (char *) it); - --cache->superstates; - return 1; -} - -#ifdef __STDC__ -static char *rx_cache_get(struct 
rx_cache *cache, - struct rx_freelist **freelist) -#else -static char *rx_cache_get(cache, freelist) -struct rx_cache *cache; -struct rx_freelist **freelist; -#endif -{ - while (!*freelist && rx_really_free_superstate(cache)); - if (!*freelist) - return 0; - { - struct rx_freelist *it = *freelist; - - *freelist = it->next; - return (char *) it; - } -} - -#ifdef __STDC__ -static char *rx_cache_malloc_or_get(struct rx_cache *cache, - struct rx_freelist **freelist, - int bytes) -#else -static char *rx_cache_malloc_or_get(cache, freelist, bytes) -struct rx_cache *cache; -struct rx_freelist **freelist; -int bytes; -#endif -{ - if (!*freelist) { - char *answer = rx_cache_malloc(cache, bytes); - - if (answer) - return answer; - } - - return rx_cache_get(cache, freelist); -} - -#ifdef __STDC__ -static char *rx_cache_get_superstate(struct rx_cache *cache) -#else -static char *rx_cache_get_superstate(cache) -struct rx_cache *cache; -#endif -{ - char *answer; - int bytes = (sizeof(struct rx_superstate) - + cache->local_cset_size * sizeof(struct rx_inx)); - - if (!cache->free_superstates - && (cache->superstates < cache->superstates_allowed)) { - answer = rx_cache_malloc(cache, bytes); - if (answer) { - ++cache->superstates; - return answer; - } - } - answer = rx_cache_get(cache, &cache->free_superstates); - if (!answer) { - answer = rx_cache_malloc(cache, bytes); - if (answer) - ++cache->superstates_allowed; - } - ++cache->superstates; - return answer; -} - - - -#ifdef __STDC__ -static int supersetcmp(void *va, void *vb) -#else -static int supersetcmp(va, vb) -void *va; -void *vb; -#endif -{ - struct rx_superset *a = (struct rx_superset *) va; - struct rx_superset *b = (struct rx_superset *) vb; - - return ((a == b) - || (a && b && (a->car == b->car) && (a->cdr == b->cdr))); -} - -#ifdef __STDC__ -static struct rx_hash_item *superset_allocator(struct rx_hash_rules *rules, - void *val) -#else -static struct rx_hash_item *superset_allocator(rules, val) -struct rx_hash_rules 
*rules; -void *val; -#endif -{ - struct rx_cache *cache = ((struct rx_cache *) - ((char *) rules - - - - (unsigned - long) (&((struct rx_cache *) - 0)->superset_hash_rules))); - struct rx_superset *template = (struct rx_superset *) val; - struct rx_superset *newset - = ((struct rx_superset *) rx_cache_malloc_or_get(cache, - &cache->free_supersets, - sizeof - - (*template))); - if (!newset) - return 0; - newset->refs = 0; - newset->car = template->car; - newset->id = template->car->id; - newset->cdr = template->cdr; - newset->superstate = 0; - rx_protect_superset(rx, template->cdr); - newset->hash_item.data = (void *) newset; - newset->hash_item.binding = 0; - return &newset->hash_item; -} - -#ifdef __STDC__ -static struct rx_hash *super_hash_allocator(struct rx_hash_rules *rules) -#else -static struct rx_hash *super_hash_allocator(rules) -struct rx_hash_rules *rules; -#endif -{ - struct rx_cache *cache = ((struct rx_cache *) - ((char *) rules - - - - (unsigned - long) (&((struct rx_cache *) - 0)->superset_hash_rules))); - return ((struct rx_hash *) - rx_cache_malloc_or_get(cache, &cache->free_hash, - - sizeof(struct rx_hash))); -} - - -#ifdef __STDC__ -static void -super_hash_liberator(struct rx_hash *hash, struct rx_hash_rules *rules) -#else -static void super_hash_liberator(hash, rules) -struct rx_hash *hash; -struct rx_hash_rules *rules; -#endif -{ - struct rx_cache *cache = ((struct rx_cache *) - - (char *) rules - - (long) (& - - ((struct rx_cache *) - 0)->superset_hash_rules)); - rx_cache_free(cache, &cache->free_hash, (char *) hash); -} - -#ifdef __STDC__ -static void -superset_hash_item_liberator(struct rx_hash_item *it, - struct rx_hash_rules *rules) -#else -static void superset_hash_item_liberator(it, rules) /* Well, it does ya know. 
*/ -struct rx_hash_item *it; -struct rx_hash_rules *rules; -#endif -{ -} - -int rx_cache_bound = 128; -static int rx_default_cache_got = 0; - -#ifdef __STDC__ -static int bytes_for_cache_size(int supers, int cset_size) -#else -static int bytes_for_cache_size(supers, cset_size) -int supers; -int cset_size; -#endif -{ - /* What the hell is this? !!! */ - return (int) - ((float) supers * ((1.03 * (float) (rx_sizeof_bitset(cset_size) - + - sizeof(struct rx_super_edge))) - + - (1.80 * - (float) sizeof(struct rx_possible_future)) + - (float) (sizeof(struct rx_superstate) - + cset_size * sizeof(struct rx_inx)))); -} - -#ifdef __STDC__ -static void rx_morecore(struct rx_cache *cache) -#else -static void rx_morecore(cache) -struct rx_cache *cache; -#endif -{ - if (rx_default_cache_got >= rx_cache_bound) - return; - - rx_default_cache_got += 16; - cache->superstates_allowed = rx_cache_bound; - { - struct rx_blocklist **pos = &cache->memory; - int size = bytes_for_cache_size(16, cache->local_cset_size); - - while (*pos) - pos = &(*pos)->next; - *pos = ((struct rx_blocklist *) - malloc(size + sizeof(struct rx_blocklist))); - - if (!*pos) - return; - - (*pos)->next = 0; - (*pos)->bytes = size; - cache->memory_pos = *pos; - cache->memory_addr = (char *) *pos + sizeof(**pos); - cache->bytes_left = size; - } -} - -static struct rx_cache default_cache = { - { - supersetcmp, - super_hash_allocator, - super_hash_liberator, - superset_allocator, - superset_hash_item_liberator, - }, - 0, - 0, - 0, - 0, - rx_morecore, - - 0, - 0, - 0, - 0, - 0, - - 0, - 0, - - 0, - - 0, - 0, - 0, - 0, - 128, - - 256, - rx_id_instruction_table, - - { - 0, - 0, - {0}, - {0}, - {0} - } -}; - -/* This adds an element to a superstate set. These sets are lists, such - * that lists with == elements are ==. The empty set is returned by - * superset_cons (rx, 0, 0) and is NOT equivelent to - * (struct rx_superset)0. 
- */ - -#ifdef __STDC__ -RX_DECL struct rx_superset *rx_superset_cons(struct rx *rx, - struct rx_nfa_state *car, - struct rx_superset *cdr) -#else -RX_DECL struct rx_superset *rx_superset_cons(rx, car, cdr) -struct rx *rx; -struct rx_nfa_state *car; -struct rx_superset *cdr; -#endif -{ - struct rx_cache *cache = rx->cache; - - if (!car && !cdr) { - if (!cache->empty_superset) { - cache->empty_superset = ((struct rx_superset *) - rx_cache_malloc_or_get(cache, - &cache->free_supersets, - - sizeof(struct - rx_superset))); - if (!cache->empty_superset) - return 0; - bzero(cache->empty_superset, sizeof(struct rx_superset)); - - cache->empty_superset->refs = 1000; - } - return cache->empty_superset; - } - { - struct rx_superset template; - struct rx_hash_item *hit; - - template.car = car; - template.cdr = cdr; - template.id = car->id; - /* While hash_store will protect cdr itself it might first allocate hash - tables and stuff which might cause it to be garbage collected before - it's protected -- [gsstark:19961026.2155EST] */ - rx_protect_superset(rx, cdr); - hit = rx_hash_store(&cache->superset_table, - (unsigned long) car ^ car->id ^ (unsigned long) - cdr, (void *) &template, - &cache->superset_hash_rules); - rx_release_superset(rx, cdr); - return (hit ? (struct rx_superset *) hit->data : 0); - } -} - -/* This computes a union of two NFA state sets. The sets do not have the - * same representation though. One is a RX_SUPERSET structure (part - * of the superstate NFA) and the other is an NFA_STATE_SET (part of the NFA). 
- */ - -#ifdef __STDC__ -RX_DECL struct rx_superset *rx_superstate_eclosure_union - (struct rx *rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) -#else -RX_DECL struct rx_superset *rx_superstate_eclosure_union(rx, set, ecl) -struct rx *rx; -struct rx_superset *set; -struct rx_nfa_state_set *ecl; -#endif -{ - if (!ecl) - return set; - - if (!set->car) - return rx_superset_cons(rx, ecl->car, - rx_superstate_eclosure_union(rx, set, - ecl->cdr)); - if (set->car == ecl->car) - return rx_superstate_eclosure_union(rx, set, ecl->cdr); - - { - struct rx_superset *tail; - struct rx_nfa_state *first; - - if (set->car > ecl->car) { - tail = rx_superstate_eclosure_union(rx, set->cdr, ecl); - first = set->car; - } else { - tail = rx_superstate_eclosure_union(rx, set, ecl->cdr); - first = ecl->car; - } - if (!tail) - return 0; - else { - struct rx_superset *answer; - - answer = rx_superset_cons(rx, first, tail); - if (!answer) { - rx_protect_superset(rx, tail); - rx_release_superset(rx, tail); - return 0; - } else - return answer; - } - } -} - - - - -/* - * This makes sure that a list of rx_distinct_futures contains - * a future for each possible set of side effects in the eclosure - * of a given state. This is some of the work of filling in a - * superstate transition. 
- */ - -#ifdef __STDC__ -static struct rx_distinct_future *include_futures(struct rx *rx, struct rx_distinct_future - *df, struct rx_nfa_state - *state, struct rx_superstate - *superstate) -#else -static struct rx_distinct_future *include_futures(rx, df, state, - superstate) -struct rx *rx; -struct rx_distinct_future *df; -struct rx_nfa_state *state; -struct rx_superstate *superstate; -#endif -{ - struct rx_possible_future *future; - struct rx_cache *cache = rx->cache; - - for (future = state->futures; future; future = future->next) { - struct rx_distinct_future *dfp; - struct rx_distinct_future *insert_before = 0; - - if (df) - df->next_same_super_edge[1]->next_same_super_edge[0] = 0; - for (dfp = df; dfp; dfp = dfp->next_same_super_edge[0]) - if (dfp->effects == future->effects) - break; - else { - int order = - - rx->se_list_cmp(rx, dfp->effects, future->effects); - if (order > 0) { - insert_before = dfp; - dfp = 0; - break; - } - } - if (df) - df->next_same_super_edge[1]->next_same_super_edge[0] = df; - if (!dfp) { - dfp = ((struct rx_distinct_future *) - rx_cache_malloc_or_get(cache, - &cache->free_discernable_futures, - - sizeof(struct - rx_distinct_future))); - if (!dfp) - return 0; - if (!df) { - df = insert_before = dfp; - df->next_same_super_edge[0] = df->next_same_super_edge[1] = - df; - } else if (!insert_before) - insert_before = df; - else if (insert_before == df) - df = dfp; - - dfp->next_same_super_edge[0] = insert_before; - dfp->next_same_super_edge[1] - = insert_before->next_same_super_edge[1]; - dfp->next_same_super_edge[1]->next_same_super_edge[0] = dfp; - dfp->next_same_super_edge[0]->next_same_super_edge[1] = dfp; - dfp->next_same_dest = dfp->prev_same_dest = dfp; - dfp->future = 0; - dfp->present = superstate; - dfp->future_frame.inx = rx->instruction_table[rx_cache_miss]; - dfp->future_frame.data = 0; - dfp->future_frame.data_2 = (void *) dfp; - dfp->side_effects_frame.inx - = rx->instruction_table[rx_do_side_effects]; - 
dfp->side_effects_frame.data = 0; - dfp->side_effects_frame.data_2 = (void *) dfp; - dfp->effects = future->effects; - } - } - return df; -} - - -/* This constructs a new superstate from its state set. The only - * complexity here is memory management. - */ -#ifdef __STDC__ -RX_DECL struct rx_superstate *rx_superstate(struct rx *rx, - struct rx_superset *set) -#else -RX_DECL struct rx_superstate *rx_superstate(rx, set) -struct rx *rx; -struct rx_superset *set; -#endif -{ - struct rx_cache *cache = rx->cache; - struct rx_superstate *superstate = 0; - - /* Does the superstate already exist in the cache? */ - if (set->superstate) { - if (set->superstate->rx_id != rx->rx_id) { - /* Aha. It is in the cache, but belongs to a superstate - * that refers to an NFA that no longer exists. - * (We know it no longer exists because it was evidently - * stored in the same region of memory as the current nfa - * yet it has a different id.) - */ - superstate = set->superstate; - if (!superstate->is_semifree) { - if (cache->lru_superstate == superstate) { - cache->lru_superstate = superstate->next_recyclable; - if (cache->lru_superstate == superstate) - cache->lru_superstate = 0; - } - { - superstate->next_recyclable->prev_recyclable - = superstate->prev_recyclable; - superstate->prev_recyclable->next_recyclable - = superstate->next_recyclable; - if (!cache->semifree_superstate) { - (cache->semifree_superstate - = superstate->next_recyclable - = superstate->prev_recyclable = superstate); - } else { - superstate->next_recyclable = - cache->semifree_superstate; - superstate->prev_recyclable = - cache->semifree_superstate->prev_recyclable; - superstate->next_recyclable->prev_recyclable = - superstate; - superstate->prev_recyclable->next_recyclable = - superstate; - cache->semifree_superstate = superstate; - } - ++cache->semifree_superstates; - } - } - set->superstate = 0; - goto handle_cache_miss; - } - ++cache->hits; - superstate = set->superstate; - - 
rx_refresh_this_superstate(cache, superstate); - return superstate; - } - - handle_cache_miss: - - /* This point reached only for cache misses. */ - ++cache->misses; -#if RX_DEBUG - if (rx_debug_trace > 1) { - struct rx_superset *setp = set; - - fprintf(stderr, "Building a superstet %d(%d): ", rx->rx_id, set); - while (setp) { - fprintf(stderr, "%d ", setp->id); - setp = setp->cdr; - } - fprintf(stderr, "(%d)\n", set); - } -#endif - superstate = (struct rx_superstate *) rx_cache_get_superstate(cache); - if (!superstate) - return 0; - - if (!cache->lru_superstate) - (cache->lru_superstate - = superstate->next_recyclable - = superstate->prev_recyclable = superstate); - else { - superstate->next_recyclable = cache->lru_superstate; - superstate->prev_recyclable = - cache->lru_superstate->prev_recyclable; - (superstate->prev_recyclable->next_recyclable = - superstate->next_recyclable->prev_recyclable = superstate); - } - superstate->rx_id = rx->rx_id; - superstate->transition_refs = 0; - superstate->locks = 0; - superstate->is_semifree = 0; - set->superstate = superstate; - superstate->contents = set; - rx_protect_superset(rx, set); - superstate->edges = 0; - { - int x; - - /* None of the transitions from this superstate are known yet. */ - for (x = 0; x < rx->local_cset_size; ++x) { /* &&&&& 3.8 % */ - struct rx_inx *ifr = &superstate->transitions[x]; - - ifr->inx = rx->instruction_table[rx_cache_miss]; - ifr->data = ifr->data_2 = 0; - } - } - return superstate; -} - - -/* This computes the destination set of one edge of the superstate NFA. - * Note that a RX_DISTINCT_FUTURE is a superstate edge. - * Returns 0 on an allocation failure. 
- */ - -#ifdef __STDC__ -static int solve_destination(struct rx *rx, struct rx_distinct_future *df) -#else -static int solve_destination(rx, df) -struct rx *rx; -struct rx_distinct_future *df; -#endif -{ - struct rx_super_edge *tc = df->edge; - struct rx_superset *nfa_state; - struct rx_superset *nil_set = rx_superset_cons(rx, 0, 0); - struct rx_superset *solution = nil_set; - struct rx_superstate *dest; - - rx_protect_superset(rx, solution); - /* Iterate over all NFA states in the state set of this superstate. */ - for (nfa_state = df->present->contents; - nfa_state->car; nfa_state = nfa_state->cdr) { - struct rx_nfa_edge *e; - - /* Iterate over all edges of each NFA state. */ - for (e = nfa_state->car->edges; e; e = e->next) - /* If we find an edge that is labeled with - * the characters we are solving for..... - */ - if (rx_bitset_is_subset(rx->local_cset_size, - tc->cset, e->params.cset)) { - struct rx_nfa_state *n = e->dest; - struct rx_possible_future *pf; - - /* ....search the partial epsilon closures of the destination - * of that edge for a path that involves the same set of - * side effects we are solving for. - * If we find such a RX_POSSIBLE_FUTURE, we add members to the - * stateset we are computing. - */ - for (pf = n->futures; pf; pf = pf->next) - if (pf->effects == df->effects) { - struct rx_superset *old_sol; - - old_sol = solution; - solution = - rx_superstate_eclosure_union(rx, solution, - pf->destset); - if (!solution) - return 0; - rx_protect_superset(rx, solution); - rx_release_superset(rx, old_sol); - } - } - } - /* It is possible that the RX_DISTINCT_FUTURE we are working on has - * the empty set of NFA states as its definition. In that case, this - * is a failure point. 
- */ - if (solution == nil_set) { - df->future_frame.inx = (void *) rx_backtrack; - df->future_frame.data = 0; - df->future_frame.data_2 = 0; - return 1; - } - dest = rx_superstate(rx, solution); - rx_release_superset(rx, solution); - if (!dest) - return 0; - - { - struct rx_distinct_future *dft; - - dft = df; - df->prev_same_dest->next_same_dest = 0; - while (dft) { - dft->future = dest; - dft->future_frame.inx = rx->instruction_table[rx_next_char]; - dft->future_frame.data = (void *) dest->transitions; - dft = dft->next_same_dest; - } - df->prev_same_dest->next_same_dest = df; - } - if (!dest->transition_refs) - dest->transition_refs = df; - else { - struct rx_distinct_future *dft = - - dest->transition_refs->next_same_dest; - dest->transition_refs->next_same_dest = df->next_same_dest; - df->next_same_dest->prev_same_dest = dest->transition_refs; - df->next_same_dest = dft; - dft->prev_same_dest = df; - } - return 1; -} - - -/* This takes a superstate and a character, and computes some edges - * from the superstate NFA. In particular, this computes all edges - * that lead from SUPERSTATE given CHR. This function also - * computes the set of characters that share this edge set. - * This returns 0 on allocation error. - * The character set and list of edges are returned through - * the paramters CSETOUT and DFOUT. -} */ - -#ifdef __STDC__ -static int -compute_super_edge(struct rx *rx, struct rx_distinct_future **dfout, - rx_Bitset csetout, struct rx_superstate *superstate, - unsigned char chr) -#else -static int compute_super_edge(rx, dfout, csetout, superstate, chr) -struct rx *rx; -struct rx_distinct_future **dfout; -rx_Bitset csetout; -struct rx_superstate *superstate; -unsigned char chr; -#endif -{ - struct rx_superset *stateset = superstate->contents; - - /* To compute the set of characters that share edges with CHR, - * we start with the full character set, and subtract. 
- */ - rx_bitset_universe(rx->local_cset_size, csetout); - *dfout = 0; - - /* Iterate over the NFA states in the superstate state-set. */ - while (stateset->car) { - struct rx_nfa_edge *e; - - for (e = stateset->car->edges; e; e = e->next) - if (RX_bitset_member(e->params.cset, chr)) { - /* If we find an NFA edge that applies, we make sure there - * are corresponding edges in the superstate NFA. - */ - { - struct rx_distinct_future *saved; - - saved = *dfout; - *dfout = - include_futures(rx, *dfout, e->dest, superstate); - if (!*dfout) { - struct rx_distinct_future *df; - - df = saved; - if (df) - df-> - next_same_super_edge - [1]->next_same_super_edge[0] = 0; - while (df) { - struct rx_distinct_future *dft; - - dft = df; - df = df->next_same_super_edge[0]; - - if (dft->future - && dft->future->transition_refs == dft) { - dft->future->transition_refs = - dft->next_same_dest; - if (dft->future->transition_refs == dft) - dft->future->transition_refs = 0; - } - dft->next_same_dest->prev_same_dest = - dft->prev_same_dest; - dft->prev_same_dest->next_same_dest = - dft->next_same_dest; - rx_cache_free(rx->cache, - &rx-> - cache->free_discernable_futures, - (char *) dft); - } - return 0; - } - } - /* We also trim the character set a bit. */ - rx_bitset_intersection(rx->local_cset_size, - csetout, e->params.cset); - } else - /* An edge that doesn't apply at least tells us some characters - * that don't share the same edge set as CHR. - */ - rx_bitset_difference(rx->local_cset_size, csetout, - e->params.cset); - stateset = stateset->cdr; - } - return 1; -} - - -/* This is a constructor for RX_SUPER_EDGE structures. These are - * wrappers for lists of superstate NFA edges that share character sets labels. - * If a transition class contains more than one rx_distinct_future (superstate - * edge), then it represents a non-determinism in the superstate NFA. 
- */ - - -#ifdef __STDC__ -static struct rx_super_edge *rx_super_edge(struct rx *rx, - struct rx_superstate *super, - rx_Bitset cset, - struct rx_distinct_future *df) -#else -static struct rx_super_edge *rx_super_edge(rx, super, cset, df) -struct rx *rx; -struct rx_superstate *super; -rx_Bitset cset; -struct rx_distinct_future *df; -#endif -{ - struct rx_super_edge *tc = - (struct rx_super_edge *) rx_cache_malloc_or_get - (rx->cache, &rx->cache->free_transition_classes, - sizeof(struct rx_super_edge) + - - rx_sizeof_bitset(rx->local_cset_size)); - - if (!tc) - return 0; - tc->next = super->edges; - super->edges = tc; - tc->rx_backtrack_frame.inx = rx->instruction_table[rx_backtrack_point]; - tc->rx_backtrack_frame.data = 0; - tc->rx_backtrack_frame.data_2 = (void *) tc; - tc->options = df; - tc->cset = (rx_Bitset) ((char *) tc + sizeof(*tc)); - rx_bitset_assign(rx->local_cset_size, tc->cset, cset); - if (df) { - struct rx_distinct_future *dfp = df; - - df->next_same_super_edge[1]->next_same_super_edge[0] = 0; - while (dfp) { - dfp->edge = tc; - dfp = dfp->next_same_super_edge[0]; - } - df->next_same_super_edge[1]->next_same_super_edge[0] = df; - } - return tc; -} - - -/* There are three kinds of cache miss. The first occurs when a - * transition is taken that has never been computed during the - * lifetime of the source superstate. That cache miss is handled by - * calling COMPUTE_SUPER_EDGE. The second kind of cache miss - * occurs when the destination superstate of a transition doesn't - * exist. SOLVE_DESTINATION is used to construct the destination superstate. - * Finally, the third kind of cache miss occurs when the destination - * superstate of a transition is in a `semi-free state'. That case is - * handled by UNFREE_SUPERSTATE. - * - * The function of HANDLE_CACHE_MISS is to figure out which of these - * cases applies. 
- */ - - -#ifdef __STDC__ -static void -install_partial_transition(struct rx_superstate *super, - struct rx_inx *answer, - RX_subset set, int offset) -#else -static void install_partial_transition(super, answer, set, offset) -struct rx_superstate *super; -struct rx_inx *answer; -RX_subset set; -int offset; -#endif -{ - int start = offset; - int end = start + 32; - RX_subset pos = 1; - struct rx_inx *transitions = super->transitions; - - while (start < end) { - if (set & pos) - transitions[start] = *answer; - pos <<= 1; - ++start; - } -} - -#ifdef __STDC__ -RX_DECL struct rx_inx *rx_handle_cache_miss - (struct rx *rx, struct rx_superstate *super, unsigned char chr, - void *data) -#else -RX_DECL struct rx_inx *rx_handle_cache_miss(rx, super, chr, data) -struct rx *rx; -struct rx_superstate *super; -unsigned char chr; -void *data; -#endif -{ - int offset = chr / RX_subset_bits; - struct rx_distinct_future *df = data; - - if (!df) { /* must be the shared_cache_miss_frame */ - /* Perhaps this is just a transition waiting to be filled. */ - struct rx_super_edge *tc; - RX_subset mask = rx_subset_singletons[chr % RX_subset_bits]; - - for (tc = super->edges; tc; tc = tc->next) - if (tc->cset[offset] & mask) { - struct rx_inx *answer; - - df = tc->options; - answer = - ((tc->options->next_same_super_edge[0] != - tc->options) ? &tc-> - rx_backtrack_frame : (df->effects ? - &df->side_effects_frame : - &df->future_frame)); - install_partial_transition(super, answer, tc->cset[offset], - offset * 32); - return answer; - } - /* Otherwise, it's a flushed or newly encountered edge. */ - { - char cset_space[1024]; /* this limit is far from unreasonable */ - rx_Bitset trcset; - struct rx_inx *answer; - - if (rx_sizeof_bitset(rx->local_cset_size) > sizeof(cset_space)) - return 0; /* If the arbitrary limit is hit, always fail */ - /* cleanly. 
*/ - trcset = (rx_Bitset) cset_space; - rx_lock_superstate(rx, super); - if (!compute_super_edge(rx, &df, trcset, super, chr)) { - rx_unlock_superstate(rx, super); - return 0; - } - if (!df) { /* We just computed the fail transition. */ - static struct rx_inx - shared_fail_frame = { 0, 0, (void *) rx_backtrack, 0 }; - - answer = &shared_fail_frame; - } else { - tc = rx_super_edge(rx, super, trcset, df); - if (!tc) { - rx_unlock_superstate(rx, super); - return 0; - } - answer = - ((tc->options->next_same_super_edge[0] != - tc->options) ? &tc-> - rx_backtrack_frame : (df->effects ? - &df->side_effects_frame : - &df->future_frame)); - } - install_partial_transition(super, answer, - trcset[offset], offset * 32); - rx_unlock_superstate(rx, super); - return answer; - } - } else if (df->future) { /* A cache miss on an edge with a future? Must be - * a semi-free destination. */ - if (df->future->is_semifree) - refresh_semifree_superstate(rx->cache, df->future); - return &df->future_frame; - } else - /* no future superstate on an existing edge */ - { - rx_lock_superstate(rx, super); - if (!solve_destination(rx, df)) { - rx_unlock_superstate(rx, super); - return 0; - } - if (!df->effects - && (df->edge->options->next_same_super_edge[0] == - df->edge->options)) install_partial_transition(super, - &df->future_frame, - df-> - edge->cset - [offset], - offset * - 32); - rx_unlock_superstate(rx, super); - return &df->future_frame; - } -} - - - - -/* The rest of the code provides a regex.c compatable interface. 
*/ - - -__const__ char *re_error_msg[] = { - 0, /* REG_NOUT */ - "No match", /* REG_NOMATCH */ - "Invalid regular expression", /* REG_BADPAT */ - "Invalid collation character", /* REG_ECOLLATE */ - "Invalid character class name", /* REG_ECTYPE */ - "Trailing backslash", /* REG_EESCAPE */ - "Invalid back reference", /* REG_ESUBREG */ - "Unmatched [ or [^", /* REG_EBRACK */ - "Unmatched ( or \\(", /* REG_EPAREN */ - "Unmatched \\{", /* REG_EBRACE */ - "Invalid content of \\{\\}", /* REG_BADBR */ - "Invalid range end", /* REG_ERANGE */ - "Memory exhausted", /* REG_ESPACE */ - "Invalid preceding regular expression", /* REG_BADRPT */ - "Premature end of regular expression", /* REG_EEND */ - "Regular expression too big", /* REG_ESIZE */ - "Unmatched ) or \\)", /* REG_ERPAREN */ -}; - - - -/* - * Macros used while compiling patterns. - * - * By convention, PEND points just past the end of the uncompiled pattern, - * P points to the read position in the pattern. `translate' is the name - * of the translation table (`TRANSLATE' is the name of a macro that looks - * things up in `translate'). - */ - - -/* - * Fetch the next character in the uncompiled pattern---translating it - * if necessary. *Also cast from a signed character in the constant - * string passed to us by the user to an unsigned char that we can use - * as an array index (in, e.g., `translate'). - */ -#define PATFETCH(c) \ - do {if (p == pend) return REG_EEND; \ - c = (unsigned char) *p++; \ - c = translate[c]; \ - } while (0) - -/* - * Fetch the next character in the uncompiled pattern, with no - * translation. - */ -#define PATFETCH_RAW(c) \ - do {if (p == pend) return REG_EEND; \ - c = (unsigned char) *p++; \ - } while (0) - -/* Go backwards one character in the pattern. 
*/ -#define PATUNFETCH p-- - - -#define TRANSLATE(d) translate[(unsigned char) (d)] - -typedef unsigned regnum_t; - -/* Since offsets can go either forwards or backwards, this type needs to - * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. - */ -typedef int pattern_offset_t; - -typedef struct { - struct rexp_node **top_expression; /* was begalt */ - struct rexp_node **last_expression; /* was laststart */ - pattern_offset_t inner_group_offset; - regnum_t regnum; -} compile_stack_elt_t; -typedef struct { - compile_stack_elt_t *stack; - unsigned size; - unsigned avail; /* Offset of next open position. */ -} compile_stack_type; - -static boolean group_in_compile_stack(compile_stack_type, regnum_t); -static reg_errcode_t -compile_range(struct re_pattern_buffer *, rx_Bitset, - __const__ char **, __const__ char *, - unsigned char *, reg_syntax_t, rx_Bitset, char *); -static void find_backrefs(char *, struct rexp_node *, - - struct re_se_params *); -static int compute_fastset(struct re_pattern_buffer *, struct rexp_node *); -static int is_anchored(struct rexp_node *, rx_side_effect); -static struct rexp_node -*remove_unecessary_side_effects - - (struct rx *, char *, struct rexp_node *, struct re_se_params *); -static int pointless_if_repeated(struct rexp_node *, - - struct re_se_params *); -static int registers_on_stack(struct re_pattern_buffer *, - struct rexp_node *, - - int, struct re_se_params *); -static int has_any_se(struct rx *, struct rexp_node *); -static int has_non_idempotent_epsilon_path - - (struct rx *, struct rexp_node *, struct re_se_params *); -static int begins_with_complex_se(struct rx *, struct rexp_node *); -static void speed_up_alt(struct rx *, struct rexp_node *, int); -RX_DECL reg_errcode_t - -rx_compile(__const__ char *, int, reg_syntax_t, - struct re_pattern_buffer *); -RX_DECL void rx_blow_up_fastmap(struct re_pattern_buffer *); -static __inline__ enum rx_get_burst_return -re_search_2_get_burst(struct rx_string_position *, 
void *, int); -static __inline__ enum rx_back_check_return -re_search_2_back_check(struct rx_string_position *, int, - int, unsigned char *, void *, int); -static __inline__ int -re_search_2_fetch_char(struct rx_string_position *, int, void *, int); - - -#define INIT_COMPILE_STACK_SIZE 32 - -#define COMPILE_STACK_EMPTY (compile_stack.avail == 0) -#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) - -/* The next available element. */ -#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) - - -/* Set the bit for character C in a list. */ -#define SET_LIST_BIT(c) \ - (b[((unsigned char) (c)) / CHARBITS] \ - |= 1 << (((unsigned char) c) % CHARBITS)) - -/* Get the next unsigned number in the uncompiled pattern. */ -#define GET_UNSIGNED_NUMBER(num) \ - { if (p != pend) \ - { \ - PATFETCH (c); \ - while (isdigit (c)) \ - { \ - if (num < 0) \ - num = 0; \ - num = num * 10 + c - '0'; \ - if (p == pend) \ - break; \ - PATFETCH (c); \ - } \ - } \ - } - -#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ - -#define IS_CHAR_CLASS(string) \ - (!strcmp (string, "alpha") || !strcmp (string, "upper") \ - || !strcmp (string, "lower") || !strcmp (string, "digit") \ - || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \ - || !strcmp (string, "space") || !strcmp (string, "print") \ - || !strcmp (string, "punct") || !strcmp (string, "graph") \ - || !strcmp (string, "cntrl") || !strcmp (string, "blank")) - - -/* These predicates are used in regex_compile. */ - -/* P points to just after a ^ in PATTERN. Return true if that ^ comes - * after an alternative or a begin-subexpression. We assume there is at - * least one character before the ^. 
- */ - -#ifdef __STDC__ -static boolean -at_begline_loc_p(__const__ char *pattern, __const__ char *p, - reg_syntax_t syntax) -#else -static boolean at_begline_loc_p(pattern, p, syntax) -__const__ char *pattern; -__const__ char *p; -reg_syntax_t syntax; -#endif -{ - __const__ char *prev = p - 2; - boolean prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\')); - - return ( /* After a subexpression? */ - ((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash)) - || - /* After an alternative? */ - ((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash)) - ); -} - -/* The dual of at_begline_loc_p. This one is for $. We assume there is - * at least one character after the $, i.e., `P < PEND'. - */ - -#ifdef __STDC__ -static boolean -at_endline_loc_p(__const__ char *p, __const__ char *pend, int syntax) -#else -static boolean at_endline_loc_p(p, pend, syntax) -__const__ char *p; -__const__ char *pend; -int syntax; -#endif -{ - __const__ char *next = p; - boolean next_backslash = (*next == '\\'); - __const__ char *next_next = (p + 1 < pend) ? (p + 1) : 0; - - return ( - /* Before a subexpression? */ - ((syntax & RE_NO_BK_PARENS) - ? (*next == ')') - : (next_backslash && next_next && (*next_next == ')'))) - || - /* Before an alternative? */ - ((syntax & RE_NO_BK_VBAR) - ? 
(*next == '|') - : (next_backslash && next_next && (*next_next == '|'))) - ); -} - - -unsigned char rx_id_translation[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, - 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, - - 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, - 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, - 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, - 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, - 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, - 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, - 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, - 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, - - 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, - 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, - 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, - 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, - 250, 251, 252, 253, 254, 255 -}; - -/* The compiler keeps an inverted translation table. - * This looks up/inititalize elements. - * VALID is an array of booleans that validate CACHE. 
- */ - -#ifdef __STDC__ -static rx_Bitset -inverse_translation(struct re_pattern_buffer *rxb, - char *valid, rx_Bitset cache, - unsigned char *translate, int c) -#else -static rx_Bitset inverse_translation(rxb, valid, cache, translate, c) -struct re_pattern_buffer *rxb; -char *valid; -rx_Bitset cache; -unsigned char *translate; -int c; -#endif -{ - rx_Bitset cs - - = cache + c * rx_bitset_numb_subsets(rxb->rx.local_cset_size); - - if (!valid[c]) { - int x; - int c_tr = TRANSLATE(c); - - rx_bitset_null(rxb->rx.local_cset_size, cs); - for (x = 0; x < 256; ++x) /* &&&& 13.37 */ - if (TRANSLATE(x) == c_tr) - RX_bitset_enjoin(cs, x); - valid[c] = 1; - } - return cs; -} - - - - -/* More subroutine declarations and macros for regex_compile. */ - -/* Returns true if REGNUM is in one of COMPILE_STACK's elements and - false if it's not. */ - -#ifdef __STDC__ -static boolean -group_in_compile_stack(compile_stack_type compile_stack, regnum_t regnum) -#else -static boolean group_in_compile_stack(compile_stack, regnum) -compile_stack_type compile_stack; -regnum_t regnum; -#endif -{ - int this_element; - - for (this_element = compile_stack.avail - 1; - this_element >= 0; this_element--) - if (compile_stack.stack[this_element].regnum == regnum) - return true; - - return false; -} - - -/* - * Read the ending character of a range (in a bracket expression) from the - * uncompiled pattern *P_PTR (which ends at PEND). We assume the - * starting character is in `P[-2]'. (`P[-1]' is the character `-'.) - * Then we set the translation of all bits between the starting and - * ending characters (inclusive) in the compiled pattern B. - * - * Return an error code. - * - * We use these short variable names so we can use the same macros as - * `regex_compile' itself. 
- */ - -#ifdef __STDC__ -static reg_errcode_t -compile_range(struct re_pattern_buffer *rxb, rx_Bitset cs, - __const__ char **p_ptr, __const__ char *pend, - unsigned char *translate, reg_syntax_t syntax, - rx_Bitset inv_tr, char *valid_inv_tr) -#else -static reg_errcode_t -compile_range(rxb, cs, p_ptr, pend, translate, syntax, inv_tr, - valid_inv_tr) -struct re_pattern_buffer *rxb; -rx_Bitset cs; -__const__ char **p_ptr; -__const__ char *pend; -unsigned char *translate; -reg_syntax_t syntax; -rx_Bitset inv_tr; -char *valid_inv_tr; -#endif -{ - unsigned this_char; - - __const__ char *p = *p_ptr; - - unsigned char range_end; - unsigned char range_start = TRANSLATE(p[-2]); - - if (p == pend) - return REG_ERANGE; - - PATFETCH(range_end); - - (*p_ptr)++; - - if (range_start > range_end) - return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; - - for (this_char = range_start; this_char <= range_end; this_char++) { - rx_Bitset it = - inverse_translation(rxb, valid_inv_tr, inv_tr, translate, - - this_char); - - rx_bitset_union(rxb->rx.local_cset_size, cs, it); - } - - return REG_NOERROR; -} - - -/* This searches a regexp for backreference side effects. - * It fills in the array OUT with 1 at the index of every register pair - * referenced by a backreference. - * - * This is used to help optimize patterns for searching. The information is - * useful because, if the caller doesn't want register values, backreferenced - * registers are the only registers for which we need rx_backtrack. 
- */ - -#ifdef __STDC__ -static void -find_backrefs(char *out, struct rexp_node *rexp, - struct re_se_params *params) -#else -static void find_backrefs(out, rexp, params) -char *out; -struct rexp_node *rexp; -struct re_se_params *params; -#endif -{ - if (rexp) - switch (rexp->type) { - case r_cset: - case r_data: - return; - case r_alternate: - case r_concat: - case r_opt: - case r_star: - case r_2phase_star: - find_backrefs(out, rexp->params.pair.left, params); - find_backrefs(out, rexp->params.pair.right, params); - return; - case r_side_effect: - if (((long) rexp->params.side_effect >= 0) - && (params[(long) rexp->params.side_effect].se == - re_se_backref)) - out[params[(long) rexp->params.side_effect].op1] = 1; - return; - } -} - - - -/* Returns 0 unless the pattern can match the empty string. */ - -#ifdef __STDC__ -static int -compute_fastset(struct re_pattern_buffer *rxb, struct rexp_node *rexp) -#else -static int compute_fastset(rxb, rexp) -struct re_pattern_buffer *rxb; -struct rexp_node *rexp; -#endif -{ - if (!rexp) - return 1; - switch (rexp->type) { - case r_data: - return 1; - case r_cset: - { - rx_bitset_union(rxb->rx.local_cset_size, - rxb->fastset, rexp->params.cset); - } - return 0; - case r_concat: - return (compute_fastset(rxb, rexp->params.pair.left) - && compute_fastset(rxb, rexp->params.pair.right)); - case r_2phase_star: - compute_fastset(rxb, rexp->params.pair.left); - /* compute_fastset (rxb, rexp->params.pair.right); nope... */ - return 1; - case r_alternate: - return !!(compute_fastset(rxb, rexp->params.pair.left) - + compute_fastset(rxb, rexp->params.pair.right)); - case r_opt: - case r_star: - compute_fastset(rxb, rexp->params.pair.left); - return 1; - case r_side_effect: - return 1; - } - - /* this should never happen */ - return 0; -} - - -/* returns - * 1 -- yes, definately anchored by the given side effect. - * 2 -- maybe anchored, maybe the empty string. - * 0 -- definately not anchored - * There is simply no other possibility. 
- */ - -#ifdef __STDC__ -static int is_anchored(struct rexp_node *rexp, rx_side_effect se) -#else -static int is_anchored(rexp, se) -struct rexp_node *rexp; -rx_side_effect se; -#endif -{ - if (!rexp) - return 2; - switch (rexp->type) { - case r_cset: - case r_data: - return 0; - case r_concat: - case r_2phase_star: - { - int l = is_anchored(rexp->params.pair.left, se); - - return (l == 2 ? is_anchored(rexp->params.pair.right, se) : l); - } - case r_alternate: - { - int l = is_anchored(rexp->params.pair.left, se); - int r = l ? is_anchored(rexp->params.pair.right, se) : 0; - - if (l == r) - return l; - else if ((l == 0) || (r == 0)) - return 0; - else - return 2; - } - case r_opt: - case r_star: - return is_anchored(rexp->params.pair.left, se) ? 2 : 0; - - case r_side_effect: - return ((rexp->params.side_effect == se) - ? 1 : 2); - } - - /* this should never happen */ - return 0; -} - - -/* This removes register assignments that aren't required by backreferencing. - * This can speed up explore_future, especially if it eliminates - * non-determinism in the superstate NFA. - * - * NEEDED is an array of characters, presumably filled in by FIND_BACKREFS. - * The non-zero elements of the array indicate which register assignments - * can NOT be removed from the expression. 
- */ - -#ifdef __STDC__ -static struct rexp_node *remove_unecessary_side_effects(struct rx *rx, - char *needed, - struct rexp_node - *rexp, - struct re_se_params - *params) -#else -static struct rexp_node *remove_unecessary_side_effects(rx, needed, rexp, - params) -struct rx *rx; -char *needed; -struct rexp_node *rexp; -struct re_se_params *params; -#endif -{ - struct rexp_node *l; - struct rexp_node *r; - - if (!rexp) - return 0; - else - switch (rexp->type) { - case r_cset: - case r_data: - return rexp; - case r_alternate: - case r_concat: - case r_2phase_star: - l = remove_unecessary_side_effects(rx, needed, - rexp->params.pair.left, - params); - r = - remove_unecessary_side_effects(rx, needed, - rexp->params.pair.right, - params); - if ((l && r) || (rexp->type != r_concat)) { - rexp->params.pair.left = l; - rexp->params.pair.right = r; - return rexp; - } else { - rexp->params.pair.left = rexp->params.pair.right = 0; - rx_free_rexp(rx, rexp); - return l ? l : r; - } - case r_opt: - case r_star: - l = remove_unecessary_side_effects(rx, needed, - rexp->params.pair.left, - params); - if (l) { - rexp->params.pair.left = l; - return rexp; - } else { - rexp->params.pair.left = 0; - rx_free_rexp(rx, rexp); - return 0; - } - case r_side_effect: - { - int se = (long) rexp->params.side_effect; - - if ((se >= 0) - && (((enum re_side_effects) params[se].se == re_se_lparen) - || ((enum re_side_effects) params[se].se == - re_se_rparen)) && (params[se].op1 > 0) - && (!needed[params[se].op1])) { - rx_free_rexp(rx, rexp); - return 0; - } else - return rexp; - } - } - - /* this should never happen */ - return 0; -} - - - -#ifdef __STDC__ -static int -pointless_if_repeated(struct rexp_node *node, struct re_se_params *params) -#else -static int pointless_if_repeated(node, params) -struct rexp_node *node; -struct re_se_params *params; -#endif -{ - if (!node) - return 1; - switch (node->type) { - case r_cset: - return 0; - case r_alternate: - case r_concat: - case r_2phase_star: - 
return (pointless_if_repeated(node->params.pair.left, params) - && pointless_if_repeated(node->params.pair.right, params)); - case r_opt: - case r_star: - return pointless_if_repeated(node->params.pair.left, params); - case r_side_effect: - switch (((long) node->params.side_effect < 0) - ? (enum re_side_effects) node->params.side_effect - : (enum re_side_effects) params[(long) node-> - params.side_effect].se) { - case re_se_try: - case re_se_at_dot: - case re_se_begbuf: - case re_se_hat: - case re_se_wordbeg: - case re_se_wordbound: - case re_se_notwordbound: - case re_se_wordend: - case re_se_endbuf: - case re_se_dollar: - case re_se_fail: - case re_se_win: - return 1; - case re_se_lparen: - case re_se_rparen: - case re_se_iter: - case re_se_end_iter: - case re_se_syntax: - case re_se_not_syntax: - case re_se_backref: - return 0; - } - case r_data: - default: - return 0; - } -} - - - -#ifdef __STDC__ -static int -registers_on_stack(struct re_pattern_buffer *rxb, - struct rexp_node *rexp, int in_danger, - struct re_se_params *params) -#else -static int registers_on_stack(rxb, rexp, in_danger, params) -struct re_pattern_buffer *rxb; -struct rexp_node *rexp; -int in_danger; -struct re_se_params *params; -#endif -{ - if (!rexp) - return 0; - else - switch (rexp->type) { - case r_cset: - case r_data: - return 0; - case r_alternate: - case r_concat: - return (registers_on_stack(rxb, rexp->params.pair.left, - in_danger, params) - || (registers_on_stack - (rxb, rexp->params.pair.right, - in_danger, params))); - case r_opt: - return registers_on_stack(rxb, rexp->params.pair.left, 0, - params); - case r_star: - return registers_on_stack(rxb, rexp->params.pair.left, 1, - params); - case r_2phase_star: - return - (registers_on_stack(rxb, rexp->params.pair.left, 1, params) - || registers_on_stack(rxb, rexp->params.pair.right, 1, - params)); - case r_side_effect: - { - int se = (long) rexp->params.side_effect; - - if (in_danger && (se >= 0) - && (params[se].op1 > 0) - && 
(((enum re_side_effects) params[se].se == re_se_lparen) - || ((enum re_side_effects) params[se].se == - re_se_rparen))) return 1; - else - return 0; - } - } - - /* this should never happen */ - return 0; -} - - - -static char idempotent_complex_se[] = { -#define RX_WANT_SE_DEFS 1 -#undef RX_DEF_SE -#undef RX_DEF_CPLX_SE -#define RX_DEF_SE(IDEM, NAME, VALUE) -#define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) IDEM, -#include <regex.h> -#undef RX_DEF_SE -#undef RX_DEF_CPLX_SE -#undef RX_WANT_SE_DEFS - 23 -}; - -static char idempotent_se[] = { - 13, -#define RX_WANT_SE_DEFS 1 -#undef RX_DEF_SE -#undef RX_DEF_CPLX_SE -#define RX_DEF_SE(IDEM, NAME, VALUE) IDEM, -#define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) -#include <regex.h> -#undef RX_DEF_SE -#undef RX_DEF_CPLX_SE -#undef RX_WANT_SE_DEFS - 42 -}; - - - -#ifdef __STDC__ -static int has_any_se(struct rx *rx, struct rexp_node *rexp) -#else -static int has_any_se(rx, rexp) -struct rx *rx; -struct rexp_node *rexp; -#endif -{ - if (!rexp) - return 0; - - switch (rexp->type) { - case r_cset: - case r_data: - return 0; - - case r_side_effect: - return 1; - - case r_2phase_star: - case r_concat: - case r_alternate: - return (has_any_se(rx, rexp->params.pair.left) - || has_any_se(rx, rexp->params.pair.right)); - - case r_opt: - case r_star: - return has_any_se(rx, rexp->params.pair.left); - } - - /* this should never happen */ - return 0; -} - - - -/* This must be called AFTER `convert_hard_loops' for a given REXP. */ -#ifdef __STDC__ -static int -has_non_idempotent_epsilon_path(struct rx *rx, - struct rexp_node *rexp, - struct re_se_params *params) -#else -static int has_non_idempotent_epsilon_path(rx, rexp, params) -struct rx *rx; -struct rexp_node *rexp; -struct re_se_params *params; -#endif -{ - if (!rexp) - return 0; - - switch (rexp->type) { - case r_cset: - case r_data: - case r_star: - return 0; - - case r_side_effect: - return - !((long) rexp->params.side_effect > 0 - ? - idempotent_complex_se[params - [(long) rexp->params. 
- side_effect].se] : - idempotent_se[-(long) rexp->params.side_effect]); - - case r_alternate: - return - (has_non_idempotent_epsilon_path(rx, - rexp->params.pair.left, - params) - || has_non_idempotent_epsilon_path(rx, - rexp->params.pair.right, - params)); - - case r_2phase_star: - case r_concat: - return - (has_non_idempotent_epsilon_path(rx, - rexp->params.pair.left, - params) - && has_non_idempotent_epsilon_path(rx, - rexp->params.pair.right, - params)); - - case r_opt: - return has_non_idempotent_epsilon_path(rx, - rexp->params.pair.left, - params); - } - - /* this should never happen */ - return 0; -} - - - -/* This computes rougly what it's name suggests. It can (and does) go wrong - * in the direction of returning spurious 0 without causing disasters. - */ -#ifdef __STDC__ -static int begins_with_complex_se(struct rx *rx, struct rexp_node *rexp) -#else -static int begins_with_complex_se(rx, rexp) -struct rx *rx; -struct rexp_node *rexp; -#endif -{ - if (!rexp) - return 0; - - switch (rexp->type) { - case r_cset: - case r_data: - return 0; - - case r_side_effect: - return ((long) rexp->params.side_effect >= 0); - - case r_alternate: - return (begins_with_complex_se(rx, rexp->params.pair.left) - && begins_with_complex_se(rx, rexp->params.pair.right)); - - - case r_concat: - return has_any_se(rx, rexp->params.pair.left); - case r_opt: - case r_star: - case r_2phase_star: - return 0; - } - - /* this should never happen */ - return 0; -} - - -/* This destructively removes some of the re_se_tv side effects from - * a rexp tree. In particular, during parsing re_se_tv was inserted on the - * right half of every | to guarantee that posix path preference could be - * honored. This function removes some which it can be determined aren't - * needed. 
- */ - -#ifdef __STDC__ -static void -speed_up_alt(struct rx *rx, struct rexp_node *rexp, int unposix) -#else -static void speed_up_alt(rx, rexp, unposix) -struct rx *rx; -struct rexp_node *rexp; -int unposix; -#endif -{ - if (!rexp) - return; - - switch (rexp->type) { - case r_cset: - case r_data: - case r_side_effect: - return; - - case r_opt: - case r_star: - speed_up_alt(rx, rexp->params.pair.left, unposix); - return; - - case r_2phase_star: - case r_concat: - speed_up_alt(rx, rexp->params.pair.left, unposix); - speed_up_alt(rx, rexp->params.pair.right, unposix); - return; - - case r_alternate: - /* the right child is guaranteed to be (concat re_se_tv <subexp>) */ - - speed_up_alt(rx, rexp->params.pair.left, unposix); - speed_up_alt(rx, rexp->params.pair.right->params.pair.right, - unposix); - - if (unposix - || (begins_with_complex_se - (rx, rexp->params.pair.right->params.pair.right)) - || !(has_any_se(rx, rexp->params.pair.right->params.pair.right) - || has_any_se(rx, rexp->params.pair.left))) { - struct rexp_node *conc = rexp->params.pair.right; - - rexp->params.pair.right = conc->params.pair.right; - conc->params.pair.right = 0; - rx_free_rexp(rx, conc); - } - } -} - - - - - -/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. - Returns one of error codes defined in `regex.h', or zero for success. - - Assumes the `allocated' (and perhaps `buffer') and `translate' - fields are set in BUFP on entry. - - If it succeeds, results are put in BUFP (if it returns an error, the - contents of BUFP are undefined): - `buffer' is the compiled pattern; - `syntax' is set to SYNTAX; - `used' is set to the length of the compiled pattern; - `fastmap_accurate' is set to zero; - `re_nsub' is set to the number of groups in PATTERN; - `not_bol' and `not_eol' are set to zero. - - The `fastmap' and `newline_anchor' fields are neither - examined nor set. 
*/ - - -#ifdef __STDC__ -RX_DECL reg_errcode_t -rx_compile(__const__ char *pattern, int size, - reg_syntax_t syntax, struct re_pattern_buffer *rxb) -#else -RX_DECL reg_errcode_t rx_compile(pattern, size, syntax, rxb) -__const__ char *pattern; -int size; -reg_syntax_t syntax; -struct re_pattern_buffer *rxb; -#endif -{ - RX_subset - inverse_translate[CHAR_SET_SIZE * - rx_bitset_numb_subsets(CHAR_SET_SIZE)]; - char validate_inv_tr[CHAR_SET_SIZE * - - rx_bitset_numb_subsets(CHAR_SET_SIZE)]; - - /* We fetch characters from PATTERN here. Even though PATTERN is - `char *' (i.e., signed), we declare these variables as unsigned, so - they can be reliably used as array indices. */ - register unsigned char c, c1; - - /* A random tempory spot in PATTERN. */ - __const__ char *p1; - - /* Keeps track of unclosed groups. */ - compile_stack_type compile_stack; - - /* Points to the current (ending) position in the pattern. */ - __const__ char *p = pattern; - __const__ char *pend = pattern + size; - - /* How to translate the characters in the pattern. */ - unsigned char *translate = (rxb->translate - - ? rxb->translate : rx_id_translation); - - /* When parsing is done, this will hold the expression tree. */ - struct rexp_node *rexp = 0; - - /* In the midst of compilation, this holds onto the regexp - * first parst while rexp goes on to aquire additional constructs. - */ - struct rexp_node *orig_rexp = 0; - struct rexp_node *fewer_side_effects = 0; - - /* This and top_expression are saved on the compile stack. */ - struct rexp_node **top_expression = &rexp; - struct rexp_node **last_expression = top_expression; - - /* Parameter to `goto append_node' */ - struct rexp_node *append; - - /* Counts open-groups as they are encountered. This is the index of the - * innermost group being compiled. - */ - regnum_t regnum = 0; - - /* Place in the uncompiled pattern (i.e., the {) to - * which to go back if the interval is invalid. 
- */ - __const__ char *beg_interval; - - struct re_se_params *params = 0; - int paramc = 0; /* How many complex side effects so far? */ - - rx_side_effect side; /* param to `goto add_side_effect' */ - - bzero(validate_inv_tr, sizeof(validate_inv_tr)); - - rxb->rx.instruction_table = rx_id_instruction_table; - - - /* Initialize the compile stack. */ - compile_stack.stack = ((compile_stack_elt_t *) - malloc((INIT_COMPILE_STACK_SIZE) * - sizeof(compile_stack_elt_t))); - if (compile_stack.stack == 0) - return REG_ESPACE; - - compile_stack.size = INIT_COMPILE_STACK_SIZE; - compile_stack.avail = 0; - - /* Initialize the pattern buffer. */ - rxb->rx.cache = &default_cache; - rxb->syntax = syntax; - rxb->fastmap_accurate = 0; - rxb->not_bol = rxb->not_eol = 0; - rxb->least_subs = 0; - - /* Always count groups, whether or not rxb->no_sub is set. - * The whole pattern is implicitly group 0, so counting begins - * with 1. - */ - rxb->re_nsub = 0; - -#if !defined (emacs) && !defined (SYNTAX_TABLE) - /* Initialize the syntax table. */ - init_syntax_once(); -#endif - - /* Loop through the uncompiled pattern until we're at the end. */ - while (p != pend) { - PATFETCH(c); - - switch (c) { - case '^': - { - if ( /* If at start of pattern, it's an operator. */ - p == pattern + 1 - /* If context independent, it's an operator. */ - || syntax & RE_CONTEXT_INDEP_ANCHORS - /* Otherwise, depends on what's come before. */ - || at_begline_loc_p(pattern, p, syntax)) { - struct rexp_node *n = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) re_se_hat); - - if (!n) - return REG_ESPACE; - append = n; - goto append_node; - } else - goto normal_char; - } - break; - - - case '$': - { - if ( /* If at end of pattern, it's an operator. */ - p == pend - /* If context independent, it's an operator. */ - || syntax & RE_CONTEXT_INDEP_ANCHORS - /* Otherwise, depends on what's next. 
*/ - || at_endline_loc_p(p, pend, syntax)) { - struct rexp_node *n = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) re_se_dollar); - - if (!n) - return REG_ESPACE; - append = n; - goto append_node; - } else - goto normal_char; - } - break; - - - case '+': - case '?': - if ((syntax & RE_BK_PLUS_QM) - || (syntax & RE_LIMITED_OPS)) - goto normal_char; - - handle_plus: - case '*': - /* If there is no previous pattern... */ - if (pointless_if_repeated(*last_expression, params)) { - if (syntax & RE_CONTEXT_INVALID_OPS) - return REG_BADRPT; - else if (!(syntax & RE_CONTEXT_INDEP_OPS)) - goto normal_char; - } - - { - /* 1 means zero (many) matches is allowed. */ - char zero_times_ok = 0, many_times_ok = 0; - - /* If there is a sequence of repetition chars, collapse it - down to just one (the right one). We can't combine - interval operators with these because of, e.g., `a{2}*', - which should only match an even number of `a's. */ - - for (;;) { - zero_times_ok |= c != '+'; - many_times_ok |= c != '?'; - - if (p == pend) - break; - - PATFETCH(c); - - if (c == '*' || (!(syntax & RE_BK_PLUS_QM) - && (c == '+' || c == '?'))); - - else if (syntax & RE_BK_PLUS_QM && c == '\\') { - if (p == pend) - return REG_EESCAPE; - - PATFETCH(c1); - if (!(c1 == '+' || c1 == '?')) { - PATUNFETCH; - PATUNFETCH; - break; - } - - c = c1; - } else { - PATUNFETCH; - break; - } - - /* If we get here, we found another repeat character. */ - } - - /* Star, etc. applied to an empty pattern is equivalent - to an empty pattern. */ - if (!last_expression) - break; - - /* Now we know whether or not zero matches is allowed - * and also whether or not two or more matches is allowed. 
- */ - - { - struct rexp_node *inner_exp = *last_expression; - int need_sync = 0; - - if (many_times_ok - && has_non_idempotent_epsilon_path(&rxb->rx, - inner_exp, - params)) { - struct rexp_node *pusher = - rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) re_se_pushpos); - struct rexp_node *checker - = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) re_se_chkpos); - struct rexp_node *pushback - = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) re_se_pushback); - rx_Bitset cs = rx_cset(&rxb->rx); - struct rexp_node *lit_t; - struct rexp_node *fake_state; - struct rexp_node *phase2; - struct rexp_node *popper; - struct rexp_node *star; - struct rexp_node *a; - struct rexp_node *whole_thing; - - if (!cs) - return REG_ESPACE; - lit_t = rx_mk_r_cset(&rxb->rx, cs); - fake_state = - rx_mk_r_concat(&rxb->rx, pushback, lit_t); - phase2 = - rx_mk_r_concat(&rxb->rx, checker, fake_state); - popper = - rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) - re_se_poppos); - star = - rx_mk_r_2phase_star(&rxb->rx, inner_exp, - phase2); - a = rx_mk_r_concat(&rxb->rx, pusher, star); - whole_thing = rx_mk_r_concat(&rxb->rx, a, popper); - - if (! - (pusher && star && pushback && lit_t - && fake_state && lit_t && phase2 && checker - && popper && a && whole_thing)) - return REG_ESPACE; - RX_bitset_enjoin(cs, 't'); - *last_expression = whole_thing; - } else { - struct rexp_node *star = - (many_times_ok ? rx_mk_r_star : rx_mk_r_opt) - (&rxb->rx, *last_expression); - - if (!star) - return REG_ESPACE; - *last_expression = star; - need_sync = has_any_se(&rxb->rx, *last_expression); - } - if (!zero_times_ok) { - struct rexp_node *concat - = rx_mk_r_concat(&rxb->rx, inner_exp, - rx_copy_rexp(&rxb->rx, - *last_expression)); - - if (!concat) - return REG_ESPACE; - *last_expression = concat; - } - if (need_sync) { - int sync_se = paramc; - - params = (params ? 
((struct re_se_params *) - realloc(params, - sizeof(*params) * (1 + - paramc))) - : ((struct re_se_params *) - malloc(sizeof(*params)))); - if (!params) - return REG_ESPACE; - ++paramc; - params[sync_se].se = re_se_tv; - side = (rx_side_effect) sync_se; - goto add_side_effect; - } - } - /* The old regex.c used to optimize `.*\n'. - * Maybe rx should too? - */ - } - break; - - - case '.': - { - rx_Bitset cs = rx_cset(&rxb->rx); - struct rexp_node *n = rx_mk_r_cset(&rxb->rx, cs); - - if (!(cs && n)) - return REG_ESPACE; - - rx_bitset_universe(rxb->rx.local_cset_size, cs); - if (!(rxb->syntax & RE_DOT_NEWLINE)) - RX_bitset_remove(cs, '\n'); - if (!(rxb->syntax & RE_DOT_NOT_NULL)) - RX_bitset_remove(cs, 0); - - append = n; - goto append_node; - break; - } - - - case '[': - if (p == pend) - return REG_EBRACK; - { - boolean had_char_class = false; - rx_Bitset cs = rx_cset(&rxb->rx); - struct rexp_node *node = rx_mk_r_cset(&rxb->rx, cs); - int is_inverted = *p == '^'; - - if (!(node && cs)) - return REG_ESPACE; - - /* This branch of the switch is normally exited with - *`goto append_node' - */ - append = node; - - if (is_inverted) - p++; - - /* Remember the first position in the bracket expression. */ - p1 = p; - - /* Read in characters and ranges, setting map bits. */ - for (;;) { - if (p == pend) - return REG_EBRACK; - - PATFETCH(c); - - /* \ might escape characters inside [...] and [^...]. */ - if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) - && c == '\\') { - if (p == pend) - return REG_EESCAPE; - - PATFETCH(c1); - { - rx_Bitset it = inverse_translation(rxb, - validate_inv_tr, - inverse_translate, - translate, - c1); - - rx_bitset_union(rxb->rx.local_cset_size, cs, - it); - } - continue; - } - - /* Could be the end of the bracket expression. If it's - not (i.e., when the bracket expression is `[]' so - far), the ']' character bit gets set way below. 
*/ - if (c == ']' && p != p1 + 1) - goto finalize_class_and_append; - - /* Look ahead to see if it's a range when the last thing - was a character class. */ - if (had_char_class && c == '-' && *p != ']') - return REG_ERANGE; - - /* Look ahead to see if it's a range when the last thing - was a character: if this is a hyphen not at the - beginning or the end of a list, then it's the range - operator. */ - if (c == '-' && !(p - 2 >= pattern && p[-2] == '[') - && !(p - 3 >= pattern && p[-3] == '[' - && p[-2] == '^') && *p != ']') { - reg_errcode_t ret = - compile_range(rxb, cs, &p, pend, translate, - syntax, - inverse_translate, - - validate_inv_tr); - - if (ret != REG_NOERROR) - return ret; - } - - else if (p[0] == '-' && p[1] != ']') { /* This handles ranges made up of characters only. */ - reg_errcode_t ret; - - /* Move past the `-'. */ - PATFETCH(c1); - - ret = - compile_range(rxb, cs, &p, pend, translate, - syntax, inverse_translate, - validate_inv_tr); - if (ret != REG_NOERROR) - return ret; - } - - /* See if we're at the beginning of a possible character - class. */ - - else if ((syntax & RE_CHAR_CLASSES) - && (c == '[') && (*p == ':')) { - char str[CHAR_CLASS_MAX_LENGTH + 1]; - - PATFETCH(c); - c1 = 0; - - /* If pattern is `[[:'. */ - if (p == pend) - return REG_EBRACK; - - for (;;) { - PATFETCH(c); - if (c == ':' || c == ']' || p == pend - || c1 == CHAR_CLASS_MAX_LENGTH) break; - str[c1++] = c; - } - str[c1] = '\0'; - - /* If isn't a word bracketed by `[:' and:`]': - undo the ending character, the letters, and leave - the leading `:' and `[' (but set bits for them). 
*/ - if (c == ':' && *p == ']') { - int ch; - boolean is_alnum = !strcmp(str, "alnum"); - boolean is_alpha = !strcmp(str, "alpha"); - boolean is_blank = !strcmp(str, "blank"); - boolean is_cntrl = !strcmp(str, "cntrl"); - boolean is_digit = !strcmp(str, "digit"); - boolean is_graph = !strcmp(str, "graph"); - boolean is_lower = !strcmp(str, "lower"); - boolean is_print = !strcmp(str, "print"); - boolean is_punct = !strcmp(str, "punct"); - boolean is_space = !strcmp(str, "space"); - boolean is_upper = !strcmp(str, "upper"); - boolean is_xdigit = !strcmp(str, "xdigit"); - - if (!IS_CHAR_CLASS(str)) - return REG_ECTYPE; - - /* Throw away the ] at the end of the character - class. */ - PATFETCH(c); - - if (p == pend) - return REG_EBRACK; - - for (ch = 0; ch < 1 << CHARBITS; ch++) { - if ((is_alnum && isalnum(ch)) - || (is_alpha && isalpha(ch)) - || (is_blank && isblank(ch)) - || (is_cntrl && iscntrl(ch)) - || (is_digit && isdigit(ch)) - || (is_graph && isgraph(ch)) - || (is_lower && islower(ch)) - || (is_print && isprint(ch)) - || (is_punct && ispunct(ch)) - || (is_space && isspace(ch)) - || (is_upper && isupper(ch)) - || (is_xdigit && isxdigit(ch))) { - rx_Bitset it = inverse_translation(rxb, - validate_inv_tr, - inverse_translate, - translate, - ch); - - rx_bitset_union(rxb-> - rx.local_cset_size, cs, - it); - } - } - had_char_class = true; - } else { - c1++; - while (c1--) - PATUNFETCH; - { - rx_Bitset it = inverse_translation(rxb, - validate_inv_tr, - inverse_translate, - translate, - '['); - - rx_bitset_union(rxb->rx.local_cset_size, - cs, it); - } - { - rx_Bitset it = inverse_translation(rxb, - validate_inv_tr, - inverse_translate, - translate, - ':'); - - rx_bitset_union(rxb->rx.local_cset_size, - cs, it); - } - had_char_class = false; - } - } else { - had_char_class = false; - { - rx_Bitset it = inverse_translation(rxb, - validate_inv_tr, - inverse_translate, - translate, - c); - - rx_bitset_union(rxb->rx.local_cset_size, cs, - it); - } - } - } - - 
finalize_class_and_append: - if (is_inverted) { - rx_bitset_complement(rxb->rx.local_cset_size, cs); - if (syntax & RE_HAT_LISTS_NOT_NEWLINE) - RX_bitset_remove(cs, '\n'); - } - goto append_node; - } - break; - - - case '(': - if (syntax & RE_NO_BK_PARENS) - goto handle_open; - else - goto normal_char; - - - case ')': - if (syntax & RE_NO_BK_PARENS) - goto handle_close; - else - goto normal_char; - - - case '\n': - if (syntax & RE_NEWLINE_ALT) - goto handle_alt; - else - goto normal_char; - - - case '|': - if (syntax & RE_NO_BK_VBAR) - goto handle_alt; - else - goto normal_char; - - - case '{': - if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) - goto handle_interval; - else - goto normal_char; - - - case '\\': - if (p == pend) - return REG_EESCAPE; - - /* Do not translate the character after the \, so that we can - distinguish, e.g., \B from \b, even if we normally would - translate, e.g., B to b. */ - PATFETCH_RAW(c); - - switch (c) { - case '(': - if (syntax & RE_NO_BK_PARENS) - goto normal_backslash; - - handle_open: - rxb->re_nsub++; - regnum++; - if (COMPILE_STACK_FULL) { - ((compile_stack.stack) = - (compile_stack_elt_t *) realloc(compile_stack.stack, - (compile_stack.size << - 1) * - sizeof - (compile_stack_elt_t))); - if (compile_stack.stack == 0) - return REG_ESPACE; - - compile_stack.size <<= 1; - } - - if (*last_expression) { - struct rexp_node *concat - = rx_mk_r_concat(&rxb->rx, *last_expression, 0); - - if (!concat) - return REG_ESPACE; - *last_expression = concat; - last_expression = &concat->params.pair.right; - } - - /* - * These are the values to restore when we hit end of this - * group. 
- */ - COMPILE_STACK_TOP.top_expression = top_expression; - COMPILE_STACK_TOP.last_expression = last_expression; - COMPILE_STACK_TOP.regnum = regnum; - - compile_stack.avail++; - - top_expression = last_expression; - break; - - - case ')': - if (syntax & RE_NO_BK_PARENS) - goto normal_backslash; - - handle_close: - /* See similar code for backslashed left paren above. */ - if (COMPILE_STACK_EMPTY) { - if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) { - goto normal_char; - } else { - return REG_ERPAREN; - } - } - - /* Since we just checked for an empty stack above, this - ``can't happen''. */ - - { - /* We don't just want to restore into `regnum', because - later groups should continue to be numbered higher, - as in `(ab)c(de)' -- the second group is #2. */ - regnum_t this_group_regnum; - struct rexp_node **inner = top_expression; - - compile_stack.avail--; - top_expression = COMPILE_STACK_TOP.top_expression; - last_expression = COMPILE_STACK_TOP.last_expression; - this_group_regnum = COMPILE_STACK_TOP.regnum; - { - int left_se = paramc; - int right_se = paramc + 1; - - params = (params ? ((struct re_se_params *) - realloc(params, - (paramc + - 2) * - sizeof(params[0]))) - : ((struct re_se_params *) - malloc(2 * sizeof(params[0])))); - if (!params) - return REG_ESPACE; - paramc += 2; - - params[left_se].se = re_se_lparen; - params[left_se].op1 = this_group_regnum; - params[right_se].se = re_se_rparen; - params[right_se].op1 = this_group_regnum; - { - struct rexp_node *left - = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) left_se); - struct rexp_node *right - = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) right_se); - struct rexp_node *c1 - = (*inner ? rx_mk_r_concat(&rxb->rx, left, - *inner) : left); - struct rexp_node *c2 = - rx_mk_r_concat(&rxb->rx, c1, right); - - if (!(left && right && c1 && c2)) - return REG_ESPACE; - *inner = c2; - } - } - break; - } - - case '|': /* `\|'. 
*/ - if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR)) - goto normal_backslash; - handle_alt: - if (syntax & RE_LIMITED_OPS) - goto normal_char; - - { - struct rexp_node *alt - = rx_mk_r_alternate(&rxb->rx, *top_expression, 0); - - if (!alt) - return REG_ESPACE; - *top_expression = alt; - last_expression = &alt->params.pair.right; - { - int sync_se = paramc; - - params = (params ? ((struct re_se_params *) - realloc(params, - (paramc + - 1) * - sizeof(params[0]))) - : ((struct re_se_params *) - malloc(sizeof(params[0])))); - if (!params) - return REG_ESPACE; - ++paramc; - - params[sync_se].se = re_se_tv; - { - struct rexp_node *sync - = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) sync_se); - struct rexp_node *conc - = rx_mk_r_concat(&rxb->rx, sync, 0); - - if (!sync || !conc) - return REG_ESPACE; - - *last_expression = conc; - last_expression = &conc->params.pair.right; - } - } - } - break; - - - case '{': - /* If \{ is a literal. */ - if (!(syntax & RE_INTERVALS) - /* If we're at `\{' and it's not the open-interval - operator. */ - || ((syntax & RE_INTERVALS) - && (syntax & RE_NO_BK_BRACES)) || (p - 2 == pattern - && p == pend)) - goto normal_backslash; - - handle_interval: - { - /* If got here, then the syntax allows intervals. */ - - /* At least (most) this many matches must be made. */ - int lower_bound = -1, upper_bound = -1; - - beg_interval = p - 1; - - if (p == pend) { - if (syntax & RE_NO_BK_BRACES) - goto unfetch_interval; - else - return REG_EBRACE; - } - - GET_UNSIGNED_NUMBER(lower_bound); - - if (c == ',') { - GET_UNSIGNED_NUMBER(upper_bound); - if (upper_bound < 0) - upper_bound = RE_DUP_MAX; - } else - /* Interval such as `{1}' => match exactly once. 
*/ - upper_bound = lower_bound; - - if (lower_bound < 0 || upper_bound > RE_DUP_MAX - || lower_bound > upper_bound) { - if (syntax & RE_NO_BK_BRACES) - goto unfetch_interval; - else - return REG_BADBR; - } - - if (!(syntax & RE_NO_BK_BRACES)) { - if (c != '\\') - return REG_EBRACE; - PATFETCH(c); - } - - if (c != '}') { - if (syntax & RE_NO_BK_BRACES) - goto unfetch_interval; - else - return REG_BADBR; - } - - /* We just parsed a valid interval. */ - - /* If it's invalid to have no preceding re. */ - if (pointless_if_repeated(*last_expression, params)) { - if (syntax & RE_CONTEXT_INVALID_OPS) - return REG_BADRPT; - else if (!(syntax & RE_CONTEXT_INDEP_OPS)) - goto unfetch_interval; - /* was: else laststart = b; */ - } - - /* If the upper bound is zero, don't want to iterate - * at all. - */ - if (upper_bound == 0) { - if (*last_expression) { - rx_free_rexp(&rxb->rx, *last_expression); - *last_expression = 0; - } - } else - /* Otherwise, we have a nontrivial interval. */ - { - int iter_se = paramc; - int end_se = paramc + 1; - - params = (params ? 
((struct re_se_params *) - realloc(params, - sizeof(*params) * (2 + - paramc))) - : ((struct re_se_params *) - malloc(2 * sizeof(*params)))); - if (!params) - return REG_ESPACE; - paramc += 2; - params[iter_se].se = re_se_iter; - params[iter_se].op1 = lower_bound; - params[iter_se].op2 = upper_bound; - - params[end_se].se = re_se_end_iter; - params[end_se].op1 = lower_bound; - params[end_se].op2 = upper_bound; - { - struct rexp_node *push0 - = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) re_se_push0); - struct rexp_node *start_one_iter - = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) iter_se); - struct rexp_node *phase1 - = rx_mk_r_concat(&rxb->rx, start_one_iter, - *last_expression); - struct rexp_node *pushback - = rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) re_se_pushback); - rx_Bitset cs = rx_cset(&rxb->rx); - struct rexp_node *lit_t; - struct rexp_node *phase2; - struct rexp_node *loop; - struct rexp_node *push_n_loop; - struct rexp_node *final_test; - struct rexp_node *full_exp; - - if (!cs) - return REG_ESPACE; - lit_t = rx_mk_r_cset(&rxb->rx, cs); - phase2 = - rx_mk_r_concat(&rxb->rx, pushback, lit_t); - loop = - rx_mk_r_2phase_star(&rxb->rx, phase1, - phase2); - push_n_loop = - rx_mk_r_concat(&rxb->rx, push0, loop); - final_test = - rx_mk_r_side_effect(&rxb->rx, - (rx_side_effect) - end_se); - full_exp = - rx_mk_r_concat(&rxb->rx, push_n_loop, - final_test); - - if (!(push0 && start_one_iter && phase1 - && pushback && lit_t && phase2 - && loop && push_n_loop && final_test - && full_exp)) return REG_ESPACE; - - RX_bitset_enjoin(cs, 't'); - - *last_expression = full_exp; - } - } - beg_interval = 0; - } - break; - - unfetch_interval: - /* If an invalid interval, match the characters as literals. */ - p = beg_interval; - beg_interval = 0; - - /* normal_char and normal_backslash need `c'. 
*/ - PATFETCH(c); - - if (!(syntax & RE_NO_BK_BRACES)) { - if (p > pattern && p[-1] == '\\') - goto normal_backslash; - } - goto normal_char; - -#ifdef emacs - /* There is no way to specify the before_dot and after_dot - operators. rms says this is ok. --karl */ - case '=': - side = (rx_side_effect) rx_se_at_dot; - goto add_side_effect; - break; - - case 's': - case 'S': - { - rx_Bitset cs = rx_cset(&rxb->rx); - struct rexp_node *set = rx_mk_r_cset(&rxb->rx, cs); - - if (!(cs && set)) - return REG_ESPACE; - if (c == 'S') - rx_bitset_universe(rxb->rx.local_cset_size, cs); - - PATFETCH(c); - { - int x; - enum syntaxcode code = syntax_spec_code[c]; - - for (x = 0; x < 256; ++x) { - - if (SYNTAX(x) == code) { - rx_Bitset it = - inverse_translation(rxb, validate_inv_tr, - inverse_translate, - translate, x); - - rx_bitset_xor(rxb->rx.local_cset_size, cs, it); - } - } - } - append = set; - goto append_node; - } - break; -#endif /* emacs */ - - - case 'w': - case 'W': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - { - rx_Bitset cs = rx_cset(&rxb->rx); - struct rexp_node *n = - (cs ? rx_mk_r_cset(&rxb->rx, cs) : 0); - - if (!(cs && n)) - return REG_ESPACE; - if (c == 'W') - rx_bitset_universe(rxb->rx.local_cset_size, cs); - { - int x; - - for (x = rxb->rx.local_cset_size - 1; x > 0; --x) - if (SYNTAX(x) & Sword) - RX_bitset_toggle(cs, x); - } - append = n; - goto append_node; - } - break; - -/* With a little extra work, some of these side effects could be optimized - * away (basicly by looking at what we already know about the surrounding - * chars). 
- */ - case '<': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - side = (rx_side_effect) re_se_wordbeg; - goto add_side_effect; - break; - - case '>': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - side = (rx_side_effect) re_se_wordend; - goto add_side_effect; - break; - - case 'b': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - side = (rx_side_effect) re_se_wordbound; - goto add_side_effect; - break; - - case 'B': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - side = (rx_side_effect) re_se_notwordbound; - goto add_side_effect; - break; - - case '`': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - side = (rx_side_effect) re_se_begbuf; - goto add_side_effect; - break; - - case '\'': - if (syntax & RE_NO_GNU_OPS) - goto normal_char; - side = (rx_side_effect) re_se_endbuf; - goto add_side_effect; - break; - - add_side_effect: - { - struct rexp_node *se - - = rx_mk_r_side_effect(&rxb->rx, side); - if (!se) - return REG_ESPACE; - append = se; - goto append_node; - } - break; - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - if (syntax & RE_NO_BK_REFS) - goto normal_char; - - c1 = c - '0'; - - if (c1 > regnum) - return REG_ESUBREG; - - /* Can't back reference to a subexpression if inside of it. */ - if (group_in_compile_stack(compile_stack, c1)) - return REG_ESUBREG; - - { - int backref_se = paramc; - - params = (params ? 
((struct re_se_params *) - realloc(params, - sizeof(*params) * (1 + - paramc))) - : ((struct re_se_params *) - malloc(sizeof(*params)))); - if (!params) - return REG_ESPACE; - ++paramc; - params[backref_se].se = re_se_backref; - params[backref_se].op1 = c1; - side = (rx_side_effect) backref_se; - goto add_side_effect; - } - break; - - case '+': - case '?': - if (syntax & RE_BK_PLUS_QM) - goto handle_plus; - else - goto normal_backslash; - - default: - normal_backslash: - /* You might think it would be useful for \ to mean - not to translate; but if we don't translate it - it will never match anything. */ - c = TRANSLATE(c); - goto normal_char; - } - break; - - - default: - /* Expects the character in `c'. */ - normal_char: - { - rx_Bitset cs = rx_cset(&rxb->rx); - struct rexp_node *match = rx_mk_r_cset(&rxb->rx, cs); - rx_Bitset it; - - if (!(cs && match)) - return REG_ESPACE; - it = inverse_translation(rxb, validate_inv_tr, - inverse_translate, translate, c); - rx_bitset_union(CHAR_SET_SIZE, cs, it); - append = match; - - append_node: - /* This genericly appends the rexp APPEND to *LAST_EXPRESSION - * and then parses the next character normally. - */ - if (*last_expression) { - struct rexp_node *concat - = rx_mk_r_concat(&rxb->rx, *last_expression, append); - - if (!concat) - return REG_ESPACE; - *last_expression = concat; - last_expression = &concat->params.pair.right; - } else - *last_expression = append; - } - } /* switch (c) */ - } /* while p != pend */ - - - { - int win_se = paramc; - - params = (params ? 
((struct re_se_params *) - realloc(params, - sizeof(*params) * (1 + paramc))) - : ((struct re_se_params *) - malloc(sizeof(*params)))); - if (!params) - return REG_ESPACE; - ++paramc; - params[win_se].se = re_se_win; - { - struct rexp_node *se - = rx_mk_r_side_effect(&rxb->rx, (rx_side_effect) win_se); - struct rexp_node *concat = rx_mk_r_concat(&rxb->rx, rexp, se); - - if (!(se && concat)) - return REG_ESPACE; - rexp = concat; - } - } - - - /* Through the pattern now. */ - - if (!COMPILE_STACK_EMPTY) - return REG_EPAREN; - - free(compile_stack.stack); - - orig_rexp = rexp; -#ifdef RX_DEBUG - if (rx_debug_compile) { - dbug_rxb = rxb; - fputs("\n\nCompiling ", stdout); - fwrite(pattern, 1, size, stdout); - fputs(":\n", stdout); - rxb->se_params = params; - print_rexp(&rxb->rx, orig_rexp, 2, re_seprint, stdout); - } -#endif - { - rx_Bitset cs = rx_cset(&rxb->rx); - rx_Bitset cs2 = rx_cset(&rxb->rx); - char *se_map = (char *) alloca(paramc); - struct rexp_node *new_rexp = 0; - - - bzero(se_map, paramc); - find_backrefs(se_map, rexp, params); - fewer_side_effects = - remove_unecessary_side_effects(&rxb->rx, se_map, - rx_copy_rexp(&rxb->rx, rexp), - params); - - speed_up_alt(&rxb->rx, rexp, 0); - speed_up_alt(&rxb->rx, fewer_side_effects, 1); - - { - char *syntax_parens = rxb->syntax_parens; - - if (syntax_parens == (char *) 0x1) - rexp = remove_unecessary_side_effects - (&rxb->rx, se_map, rexp, params); - else if (syntax_parens) { - int x; - - for (x = 0; x < paramc; ++x) - if (((params[x].se == re_se_lparen) - || (params[x].se == re_se_rparen)) - && (!syntax_parens[params[x].op1])) - se_map[x] = 1; - rexp = remove_unecessary_side_effects - (&rxb->rx, se_map, rexp, params); - } - } - - /* At least one more optimization would be nice to have here but i ran out - * of time. The idea would be to delay side effects. - * For examle, `(abc)' is the same thing as `abc()' except that the - * left paren is offset by 3 (which we know at compile time). 
- * (In this comment, write that second pattern `abc(:3:)' - * where `(:3:' is a syntactic unit.) - * - * Trickier: `(abc|defg)' is the same as `(abc(:3:|defg(:4:))' - * (The paren nesting may be hard to follow -- that's an alternation - * of `abc(:3:' and `defg(:4:' inside (purely syntactic) parens - * followed by the closing paren from the original expression.) - * - * Neither the expression tree representation nor the the nfa make - * this very easy to write. :( - */ - - /* What we compile is different than what the parser returns. - * Suppose the parser returns expression R. - * Let R' be R with unnecessary register assignments removed - * (see REMOVE_UNECESSARY_SIDE_EFFECTS, above). - * - * What we will compile is the expression: - * - * m{try}R{win}\|s{try}R'{win} - * - * {try} and {win} denote side effect epsilons (see EXPLORE_FUTURE). - * - * When trying a match, we insert an `m' at the beginning of the - * string if the user wants registers to be filled, `s' if not. - */ - new_rexp = - rx_mk_r_alternate - (&rxb->rx, - rx_mk_r_concat(&rxb->rx, rx_mk_r_cset(&rxb->rx, cs2), rexp), - rx_mk_r_concat(&rxb->rx, - rx_mk_r_cset(&rxb->rx, cs), - fewer_side_effects)); - - if (!(new_rexp && cs && cs2)) - return REG_ESPACE; - RX_bitset_enjoin(cs2, '\0'); /* prefixed to the rexp used for matching. */ - RX_bitset_enjoin(cs, '\1'); /* prefixed to the rexp used for searching. 
*/ - rexp = new_rexp; - } - -#ifdef RX_DEBUG - if (rx_debug_compile) { - fputs("\n...which is compiled as:\n", stdout); - print_rexp(&rxb->rx, rexp, 2, re_seprint, stdout); - } -#endif - { - struct rx_nfa_state *start = 0; - struct rx_nfa_state *end = 0; - - if (!rx_build_nfa(&rxb->rx, rexp, &start, &end)) - return REG_ESPACE; /* */ - else { - void *mem = (void *) rxb->buffer; - unsigned long size = rxb->allocated; - int start_id; - char *perm_mem; - int iterator_size = paramc * sizeof(params[0]); - - end->is_final = 1; - start->is_start = 1; - rx_name_nfa_states(&rxb->rx); - start_id = start->id; -#ifdef RX_DEBUG - if (rx_debug_compile) { - fputs("...giving the NFA: \n", stdout); - dbug_rxb = rxb; - print_nfa(&rxb->rx, rxb->rx.nfa_states, re_seprint, - stdout); - } -#endif - if (!rx_eclose_nfa(&rxb->rx)) - return REG_ESPACE; - else { - rx_delete_epsilon_transitions(&rxb->rx); - - /* For compatability reasons, we need to shove the - * compiled nfa into one chunk of malloced memory. - */ - rxb->rx.reserved = (sizeof(params[0]) * paramc - + - rx_sizeof_bitset(rxb-> - rx.local_cset_size)); -#ifdef RX_DEBUG - if (rx_debug_compile) { - dbug_rxb = rxb; - fputs("...which cooks down (uncompactified) to: \n", - stdout); - print_nfa(&rxb->rx, rxb->rx.nfa_states, re_seprint, - stdout); - } -#endif - if (!rx_compactify_nfa(&rxb->rx, &mem, &size)) - return REG_ESPACE; - rxb->buffer = mem; - rxb->allocated = size; - rxb->rx.buffer = mem; - rxb->rx.allocated = size; - perm_mem = ((char *) rxb->rx.buffer - + rxb->rx.allocated - rxb->rx.reserved); - rxb->se_params = ((struct re_se_params *) perm_mem); - bcopy(params, rxb->se_params, iterator_size); - perm_mem += iterator_size; - rxb->fastset = (rx_Bitset) perm_mem; - rxb->start = rx_id_to_nfa_state(&rxb->rx, start_id); - } - rx_bitset_null(rxb->rx.local_cset_size, rxb->fastset); - rxb->can_match_empty = compute_fastset(rxb, orig_rexp); - rxb->match_regs_on_stack = - registers_on_stack(rxb, orig_rexp, 0, params); - 
rxb->search_regs_on_stack = - registers_on_stack(rxb, fewer_side_effects, 0, params); - if (rxb->can_match_empty) - rx_bitset_universe(rxb->rx.local_cset_size, rxb->fastset); - rxb->is_anchored = - is_anchored(orig_rexp, (rx_side_effect) re_se_hat); - rxb->begbuf_only = - is_anchored(orig_rexp, (rx_side_effect) re_se_begbuf); - } - rx_free_rexp(&rxb->rx, rexp); - if (params) - free(params); -#ifdef RX_DEBUG - if (rx_debug_compile) { - dbug_rxb = rxb; - fputs("...which cooks down to: \n", stdout); - print_nfa(&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout); - } -#endif - } - return REG_NOERROR; -} - - - -/* This table gives an error message for each of the error codes listed - in regex.h. Obviously the order here has to be same as there. */ - -__const__ char *rx_error_msg[] = { 0, /* REG_NOERROR */ - "No match", /* REG_NOMATCH */ - "Invalid regular expression", /* REG_BADPAT */ - "Invalid collation character", /* REG_ECOLLATE */ - "Invalid character class name", /* REG_ECTYPE */ - "Trailing backslash", /* REG_EESCAPE */ - "Invalid back reference", /* REG_ESUBREG */ - "Unmatched [ or [^", /* REG_EBRACK */ - "Unmatched ( or \\(", /* REG_EPAREN */ - "Unmatched \\{", /* REG_EBRACE */ - "Invalid content of \\{\\}", /* REG_BADBR */ - "Invalid range end", /* REG_ERANGE */ - "Memory exhausted", /* REG_ESPACE */ - "Invalid preceding regular expression", /* REG_BADRPT */ - "Premature end of regular expression", /* REG_EEND */ - "Regular expression too big", /* REG_ESIZE */ - "Unmatched ) or \\)", /* REG_ERPAREN */ -}; - - - - -char rx_slowmap[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -}; - -#ifdef __STDC__ -RX_DECL void rx_blow_up_fastmap(struct re_pattern_buffer *rxb) -#else -RX_DECL void rx_blow_up_fastmap(rxb) -struct re_pattern_buffer *rxb; -#endif -{ - int x; - - for (x = 0; x < 256; ++x) /* &&&& 3.6 % */ - rxb->fastmap[x] = !!RX_bitset_member(rxb->fastset, x); - rxb->fastmap_accurate = 1; -} - - - - -#if !defined(REGEX_MALLOC) && !defined(__GNUC__) -#define RE_SEARCH_2_FN inner_re_search_2 -#define RE_S2_QUAL static -#else -#define RE_SEARCH_2_FN re_search_2 -#define RE_S2_QUAL -#endif - -struct re_search_2_closure { - __const__ char *string1; - int size1; - __const__ char *string2; - int size2; -}; - -RE_S2_QUAL int -RE_SEARCH_2_FN(struct re_pattern_buffer *, - __const__ char *, - int, __const__ char *, int, int, - - int, struct re_registers *, int); -int re_rx_search(struct re_pattern_buffer *, int, - int, int, int, rx_get_burst_fn, - rx_back_check_fn, rx_fetch_char_fn, - void *, struct re_registers *, - - struct rx_search_state *, struct rx_search_state *); -#if !defined(REGEX_MALLOC) && !defined(__GNUC__) -int re_search_2(struct re_pattern_buffer *, - __const__ char *, int, - __const__ char *, int, - - int, int, struct re_registers *, int); -#endif -int re_search(struct re_pattern_buffer *, - - __const__ char *, int, int, int, struct re_registers *); -int re_match_2(struct re_pattern_buffer *, - __const__ char *, int, - __const__ char *, int, int, struct re_registers *, int); -int re_match(struct re_pattern_buffer *, - - __const__ char *, int, int, struct re_registers *); -reg_syntax_t re_set_syntax(reg_syntax_t); -void re_set_registers(struct re_pattern_buffer *, - 
struct re_registers *, unsigned, - regoff_t *, regoff_t *); -static int cplx_se_sublist_len(struct rx_se_list *); -static int posix_se_list_order(struct rx *, struct rx_se_list *, - - struct rx_se_list *); -__const__ char -*re_compile_pattern(__const__ char *, int, struct re_pattern_buffer *); -int re_compile_fastmap(struct re_pattern_buffer *); -char *re_comp(__const__ char *); -int re_exec(__const__ char *); -int regcomp(regex_t *, __const__ char *, int); -int regexec(__const__ regex_t *, - __const__ char *, size_t, regmatch_t pmatch[], int); -size_t regerror(int, __const__ regex_t *, char *, size_t); - -#ifdef __STDC__ -static __inline__ enum rx_get_burst_return -re_search_2_get_burst(struct rx_string_position *pos, - void *vclosure, int stop) -#else -static __inline__ enum rx_get_burst_return -re_search_2_get_burst(pos, vclosure, stop) -struct rx_string_position *pos; -void *vclosure; -int stop; -#endif -{ - struct re_search_2_closure *closure; - - closure = (struct re_search_2_closure *) vclosure; - if (!closure->string2) { - int inset; - - inset = pos->pos - pos->string; - if ((inset < -1) || (inset > closure->size1)) - return rx_get_burst_no_more; - else { - pos->pos = - (__const__ unsigned char *) closure->string1 + inset; - pos->string = (__const__ unsigned char *) closure->string1; - pos->size = closure->size1; - pos->end = ((__const__ unsigned char *) - MIN(closure->string1 + closure->size1, - closure->string1 + stop)); - pos->offset = 0; - return ((pos->pos < pos->end) - ? rx_get_burst_ok : rx_get_burst_no_more); - } - } else if (!closure->string1) { - int inset; - - inset = pos->pos - pos->string; - pos->pos = (__const__ unsigned char *) closure->string2 + inset; - pos->string = (__const__ unsigned char *) closure->string2; - pos->size = closure->size2; - pos->end = ((__const__ unsigned char *) - MIN(closure->string2 + closure->size2, - closure->string2 + stop)); - pos->offset = 0; - return ((pos->pos < pos->end) - ? 
rx_get_burst_ok : rx_get_burst_no_more); - } else { - int inset; - - inset = pos->pos - pos->string + pos->offset; - if (inset < closure->size1) { - pos->pos = - (__const__ unsigned char *) closure->string1 + inset; - pos->string = (__const__ unsigned char *) closure->string1; - pos->size = closure->size1; - pos->end = ((__const__ unsigned char *) - MIN(closure->string1 + closure->size1, - closure->string1 + stop)); - pos->offset = 0; - return rx_get_burst_ok; - } else { - pos->pos = ((__const__ unsigned char *) - closure->string2 + inset - closure->size1); - pos->string = (__const__ unsigned char *) closure->string2; - pos->size = closure->size2; - pos->end = ((__const__ unsigned char *) - MIN(closure->string2 + closure->size2, - closure->string2 + stop - closure->size1)); - pos->offset = closure->size1; - return ((pos->pos < pos->end) - ? rx_get_burst_ok : rx_get_burst_no_more); - } - } -} - - -#ifdef __STDC__ -static __inline__ enum rx_back_check_return -re_search_2_back_check(struct rx_string_position *pos, - int lparen, int rparen, unsigned char *translate, - void *vclosure, int stop) -#else -static __inline__ enum rx_back_check_return -re_search_2_back_check(pos, lparen, rparen, translate, vclosure, stop) -struct rx_string_position *pos; -int lparen; -int rparen; -unsigned char *translate; -void *vclosure; -int stop; -#endif -{ - struct rx_string_position there; - struct rx_string_position past; - - there = *pos; - there.pos = there.string + lparen - there.offset; - re_search_2_get_burst(&there, vclosure, stop); - - past = *pos; - past.pos = past.string + rparen - there.offset; - re_search_2_get_burst(&past, vclosure, stop); - - ++pos->pos; - re_search_2_get_burst(pos, vclosure, stop); - - while ((there.pos != past.pos) - && (pos->pos != pos->end)) - if (TRANSLATE(*there.pos) != TRANSLATE(*pos->pos)) - return rx_back_check_fail; - else { - ++there.pos; - ++pos->pos; - if (there.pos == there.end) - re_search_2_get_burst(&there, vclosure, stop); - if (pos->pos 
== pos->end) - re_search_2_get_burst(pos, vclosure, stop); - } - - if (there.pos != past.pos) - return rx_back_check_fail; - --pos->pos; - re_search_2_get_burst(pos, vclosure, stop); - return rx_back_check_pass; -} - -#ifdef __STDC__ -static __inline__ int -re_search_2_fetch_char(struct rx_string_position *pos, int offset, - void *app_closure, int stop) -#else -static __inline__ int -re_search_2_fetch_char(pos, offset, app_closure, stop) -struct rx_string_position *pos; -int offset; -void *app_closure; -int stop; -#endif -{ - struct re_search_2_closure *closure; - - closure = (struct re_search_2_closure *) app_closure; - if (offset == 0) { - if (pos->pos >= pos->string) - return *pos->pos; - else { - if ( - (pos->string == - (__const__ unsigned char *) closure->string2) - && (closure->string1) && (closure->size1)) - return closure->string1[closure->size1 - 1]; - else - return 0; /* sure, why not. */ - } - } - if (pos->pos == pos->end) - return *closure->string2; - else -#if 0 - return pos->pos[1]; -#else - return pos->pos[offset]; /* FIXME */ -#endif -} - -#ifdef __STDC__ -RE_S2_QUAL int -RE_SEARCH_2_FN(struct re_pattern_buffer *rxb, - __const__ char *string1, int size1, - __const__ char *string2, int size2, - int startpos, int range, - struct re_registers *regs, int stop) -#else -RE_S2_QUAL int -RE_SEARCH_2_FN(rxb, - string1, size1, string2, size2, startpos, range, regs, stop) -struct re_pattern_buffer *rxb; -__const__ char *string1; -int size1; -__const__ char *string2; -int size2; -int startpos; -int range; -struct re_registers *regs; -int stop; -#endif -{ - int answer; - struct re_search_2_closure closure; - - closure.string1 = string1; - closure.size1 = size1; - closure.string2 = string2; - closure.size2 = size2; - answer = rx_search(rxb, startpos, range, stop, size1 + size2, - re_search_2_get_burst, - re_search_2_back_check, - re_search_2_fetch_char, - (void *) &closure, regs, 0, 0); - switch (answer) { - case rx_search_continuation: - abort(); - case 
rx_search_error: - return -2; - case rx_search_soft_fail: - case rx_search_fail: - return -1; - default: - return answer; - } -} - -/* Export rx_search to callers outside this file. */ - -#ifdef __STDC__ -int -re_rx_search(struct re_pattern_buffer *rxb, int startpos, int range, - int stop, int total_size, rx_get_burst_fn get_burst, - rx_back_check_fn back_check, rx_fetch_char_fn fetch_char, - void *app_closure, struct re_registers *regs, - struct rx_search_state *resume_state, - struct rx_search_state *save_state) -#else -int -re_rx_search(rxb, startpos, range, stop, total_size, - get_burst, back_check, fetch_char, - app_closure, regs, resume_state, save_state) -struct re_pattern_buffer *rxb; -int startpos; -int range; -int stop; -int total_size; -rx_get_burst_fn get_burst; -rx_back_check_fn back_check; -rx_fetch_char_fn fetch_char; -void *app_closure; -struct re_registers *regs; -struct rx_search_state *resume_state; -struct rx_search_state *save_state; -#endif -{ - return rx_search(rxb, startpos, range, stop, total_size, - get_burst, back_check, fetch_char, app_closure, - regs, resume_state, save_state); -} - -#if !defined(REGEX_MALLOC) && !defined(__GNUC__) -#ifdef __STDC__ -int -re_search_2(struct re_pattern_buffer *rxb, - __const__ char *string1, int size1, - __const__ char *string2, int size2, - int startpos, int range, struct re_registers *regs, int stop) -#else -int -re_search_2(rxb, string1, size1, string2, size2, startpos, range, regs, - stop) -struct re_pattern_buffer *rxb; -__const__ char *string1; -int size1; -__const__ char *string2; -int size2; -int startpos; -int range; -struct re_registers *regs; -int stop; -#endif -{ - int ret; - - ret = inner_re_search_2(rxb, string1, size1, string2, size2, startpos, - range, regs, stop); - alloca(0); - return ret; -} -#endif - - -/* Like re_search_2, above, but only one string is specified, and - * doesn't let you say where to stop matching. 
- */ - -#ifdef __STDC__ -int -re_search(struct re_pattern_buffer *rxb, __const__ char *string, - int size, int startpos, int range, struct re_registers *regs) -#else -int re_search(rxb, string, size, startpos, range, regs) -struct re_pattern_buffer *rxb; -__const__ char *string; -int size; -int startpos; -int range; -struct re_registers *regs; -#endif -{ - return re_search_2(rxb, 0, 0, string, size, startpos, range, regs, - size); -} - -#ifdef __STDC__ -int -re_match_2(struct re_pattern_buffer *rxb, - __const__ char *string1, int size1, - __const__ char *string2, int size2, - int pos, struct re_registers *regs, int stop) -#else -int re_match_2(rxb, string1, size1, string2, size2, pos, regs, stop) -struct re_pattern_buffer *rxb; -__const__ char *string1; -int size1; -__const__ char *string2; -int size2; -int pos; -struct re_registers *regs; -int stop; -#endif -{ - struct re_registers some_regs; - regoff_t start; - regoff_t end; - int srch; - int save = rxb->regs_allocated; - struct re_registers *regs_to_pass = regs; - char *old_fastmap = rxb->fastmap; - - if (!regs) { - some_regs.start = &start; - some_regs.end = &end; - some_regs.num_regs = 1; - regs_to_pass = &some_regs; - rxb->regs_allocated = REGS_FIXED; - } - - rxb->fastmap = NULL; - srch = re_search_2(rxb, string1, size1, string2, size2, - pos, 1, regs_to_pass, stop); - rxb->fastmap = old_fastmap; - if (regs_to_pass != regs) - rxb->regs_allocated = save; - if (srch < 0) - return srch; - return regs_to_pass->end[0] - regs_to_pass->start[0]; -} - -/* re_match is like re_match_2 except it takes only a single string. 
*/ - -#ifdef __STDC__ -int -re_match(struct re_pattern_buffer *rxb, - __const__ char *string, - int size, int pos, struct re_registers *regs) -#else -int re_match(rxb, string, size, pos, regs) -struct re_pattern_buffer *rxb; -__const__ char *string; -int size; -int pos; -struct re_registers *regs; -#endif -{ - return re_match_2(rxb, string, size, 0, 0, pos, regs, size); -} - - - -/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can - also be assigned to arbitrarily: each pattern buffer stores its own - syntax, so it can be changed between regex compilations. */ -reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; - - -/* Specify the precise syntax of regexps for compilation. This provides - for compatibility for various utilities which historically have - different, incompatible syntaxes. - - The argument SYNTAX is a bit mask comprised of the various bits - defined in regex.h. We return the old syntax. */ - -#ifdef __STDC__ -reg_syntax_t re_set_syntax(reg_syntax_t syntax) -#else -reg_syntax_t re_set_syntax(syntax) -reg_syntax_t syntax; -#endif -{ - reg_syntax_t ret = re_syntax_options; - - re_syntax_options = syntax; - return ret; -} - - -/* Set REGS to hold NUM_REGS registers, storing them in STARTS and - ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use - this memory for recording register information. STARTS and ENDS - must be allocated using the malloc library routine, and must each - be at least NUM_REGS * sizeof (regoff_t) bytes long. - - If NUM_REGS == 0, then subsequent matches should allocate their own - register data. - - Unless this function is called, the first search or match using - PATTERN_BUFFER will allocate its own register data, without - freeing the old data. 
*/ - -#ifdef __STDC__ -void -re_set_registers(struct re_pattern_buffer *bufp, - struct re_registers *regs, - unsigned num_regs, regoff_t * starts, regoff_t * ends) -#else -void re_set_registers(bufp, regs, num_regs, starts, ends) -struct re_pattern_buffer *bufp; -struct re_registers *regs; -unsigned num_regs; -regoff_t *starts; -regoff_t *ends; -#endif -{ - if (num_regs) { - bufp->regs_allocated = REGS_REALLOCATE; - regs->num_regs = num_regs; - regs->start = starts; - regs->end = ends; - } else { - bufp->regs_allocated = REGS_UNALLOCATED; - regs->num_regs = 0; - regs->start = regs->end = (regoff_t) 0; - } -} - - - - -#ifdef __STDC__ -static int cplx_se_sublist_len(struct rx_se_list *list) -#else -static int cplx_se_sublist_len(list) -struct rx_se_list *list; -#endif -{ - int x = 0; - - while (list) { - if ((long) list->car >= 0) - ++x; - list = list->cdr; - } - return x; -} - - -/* For rx->se_list_cmp */ - -#ifdef __STDC__ -static int -posix_se_list_order(struct rx *rx, - struct rx_se_list *a, struct rx_se_list *b) -#else -static int posix_se_list_order(rx, a, b) -struct rx *rx; -struct rx_se_list *a; -struct rx_se_list *b; -#endif -{ - int al = cplx_se_sublist_len(a); - int bl = cplx_se_sublist_len(b); - - if (!al && !bl) - return ((a == b) - ? 0 : ((a < b) ? 
-1 : 1)); - - else if (!al) - return -1; - - else if (!bl) - return 1; - - else { - rx_side_effect *av = ((rx_side_effect *) - alloca(sizeof(rx_side_effect) * (al + 1))); - rx_side_effect *bv = ((rx_side_effect *) - alloca(sizeof(rx_side_effect) * (bl + 1))); - struct rx_se_list *ap = a; - struct rx_se_list *bp = b; - int ai, bi; - - for (ai = al - 1; ai >= 0; --ai) { - while ((long) ap->car < 0) - ap = ap->cdr; - av[ai] = ap->car; - ap = ap->cdr; - } - av[al] = (rx_side_effect) - 2; - for (bi = bl - 1; bi >= 0; --bi) { - while ((long) bp->car < 0) - bp = bp->cdr; - bv[bi] = bp->car; - bp = bp->cdr; - } - bv[bl] = (rx_side_effect) - 1; - - { - int ret; - int x = 0; - - while (av[x] == bv[x]) - ++x; - ret = (((unsigned *) (av[x]) < (unsigned *) (bv[x])) ? -1 : 1); - return ret; - } - } -} - - - - -/* re_compile_pattern is the GNU regular expression compiler: it - compiles PATTERN (of length SIZE) and puts the result in RXB. - Returns 0 if the pattern was valid, otherwise an error string. - - Assumes the `allocated' (and perhaps `buffer') and `translate' fields - are set in RXB on entry. - - We call rx_compile to do the actual compilation. */ - -#ifdef __STDC__ -__const__ char *re_compile_pattern(__const__ char *pattern, - int length, - struct re_pattern_buffer *rxb) -#else -__const__ char *re_compile_pattern(pattern, length, rxb) -__const__ char *pattern; -int length; -struct re_pattern_buffer *rxb; -#endif -{ - reg_errcode_t ret; - - /* GNU code is written to assume at least RE_NREGS registers will be set - (and at least one extra will be -1). */ - rxb->regs_allocated = REGS_UNALLOCATED; - - /* And GNU code determines whether or not to get register information - by passing null for the REGS argument to re_match, etc., not by - setting no_sub. */ - rxb->no_sub = 0; - - rxb->rx.local_cset_size = 256; - - /* Match anchors at newline. 
*/ - rxb->newline_anchor = 1; - - rxb->re_nsub = 0; - rxb->start = 0; - rxb->se_params = 0; - rxb->rx.nodec = 0; - rxb->rx.epsnodec = 0; - rxb->rx.instruction_table = 0; - rxb->rx.nfa_states = 0; - rxb->rx.se_list_cmp = posix_se_list_order; - rxb->rx.start_set = 0; - - ret = rx_compile(pattern, length, re_syntax_options, rxb); - alloca(0); - return rx_error_msg[(int) ret]; -} - - -#ifdef __STDC__ -int re_compile_fastmap(struct re_pattern_buffer *rxb) -#else -int re_compile_fastmap(rxb) -struct re_pattern_buffer *rxb; -#endif -{ - rx_blow_up_fastmap(rxb); - return 0; -} - - - - -/* Entry points compatible with 4.2 BSD regex library. We don't define - them if this is an Emacs or POSIX compilation. */ - -#if (!defined (emacs) && !defined (_POSIX_SOURCE)) || defined(USE_BSD_REGEX) - -/* BSD has one and only one pattern buffer. */ -static struct re_pattern_buffer rx_comp_buf; - -#ifdef __STDC__ -char *re_comp(__const__ char *s) -#else -char *re_comp(s) -__const__ char *s; -#endif -{ - reg_errcode_t ret; - - if (!s || (*s == '\0')) { - if (!rx_comp_buf.buffer) - return "No previous regular expression"; - return 0; - } - - if (!rx_comp_buf.fastmap) { - rx_comp_buf.fastmap = (char *) malloc(1 << CHARBITS); - if (!rx_comp_buf.fastmap) - return "Memory exhausted"; - } - - /* Since `rx_exec' always passes NULL for the `regs' argument, we - don't need to initialize the pattern buffer fields which affect it. */ - - /* Match anchors at newlines. */ - rx_comp_buf.newline_anchor = 1; - - rx_comp_buf.re_nsub = 0; - rx_comp_buf.start = 0; - rx_comp_buf.se_params = 0; - rx_comp_buf.rx.nodec = 0; - rx_comp_buf.rx.epsnodec = 0; - rx_comp_buf.rx.instruction_table = 0; - rx_comp_buf.rx.nfa_states = 0; - rx_comp_buf.rx.start = 0; - rx_comp_buf.rx.se_list_cmp = posix_se_list_order; - rx_comp_buf.rx.start_set = 0; - rx_comp_buf.rx.local_cset_size = 256; - - ret = rx_compile(s, strlen(s), re_syntax_options, &rx_comp_buf); - alloca(0); - - /* Yes, we're discarding `__const__' here. 
*/ - return (char *) rx_error_msg[(int) ret]; -} - - -#ifdef __STDC__ -int re_exec(__const__ char *s) -#else -int re_exec(s) -__const__ char *s; -#endif -{ - __const__ int len = strlen(s); - - return - 0 <= re_search(&rx_comp_buf, s, len, 0, len, - (struct re_registers *) 0); -} -#endif /* not emacs and not _POSIX_SOURCE */ - - - -/* POSIX.2 functions. Don't define these for Emacs. */ - -#if !defined(emacs) - -/* regcomp takes a regular expression as a string and compiles it. - - PREG is a regex_t *. We do not expect any fields to be initialized, - since POSIX says we shouldn't. Thus, we set - - `buffer' to the compiled pattern; - `used' to the length of the compiled pattern; - `syntax' to RE_SYNTAX_POSIX_EXTENDED if the - REG_EXTENDED bit in CFLAGS is set; otherwise, to - RE_SYNTAX_POSIX_BASIC; - `newline_anchor' to REG_NEWLINE being set in CFLAGS; - `fastmap' and `fastmap_accurate' to zero; - `re_nsub' to the number of subexpressions in PATTERN. - - PATTERN is the address of the pattern string. - - CFLAGS is a series of bits which affect compilation. - - If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we - use POSIX basic syntax. - - If REG_NEWLINE is set, then . and [^...] don't match newline. - Also, regexec will try a match beginning after every newline. - - If REG_ICASE is set, then we considers upper- and lowercase - versions of letters to be equivalent when matching. - - If REG_NOSUB is set, then when PREG is passed to regexec, that - routine will report only success or failure, and nothing about the - registers. - - It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for - the return codes and their meanings.) */ - - -#ifdef __STDC__ -int regcomp(regex_t * preg, __const__ char *pattern, int cflags) -#else -int regcomp(preg, pattern, cflags) -regex_t *preg; -__const__ char *pattern; -int cflags; -#endif -{ - reg_errcode_t ret; - unsigned syntax - - = - cflags & REG_EXTENDED ? 
RE_SYNTAX_POSIX_EXTENDED : - RE_SYNTAX_POSIX_BASIC; - - /* regex_compile will allocate the space for the compiled pattern. */ - preg->buffer = 0; - preg->allocated = 0; - preg->fastmap = malloc(256); - if (!preg->fastmap) - return REG_ESPACE; - preg->fastmap_accurate = 0; - - if (cflags & REG_ICASE) { - unsigned i; - - preg->translate = (unsigned char *) malloc(256); - if (!preg->translate) - return (int) REG_ESPACE; - - /* Map uppercase characters to corresponding lowercase ones. */ - for (i = 0; i < CHAR_SET_SIZE; i++) - preg->translate[i] = isupper(i) ? tolower(i) : i; - } else - preg->translate = 0; - - /* If REG_NEWLINE is set, newlines are treated differently. */ - if (cflags & REG_NEWLINE) { /* REG_NEWLINE implies neither . nor [^...] match newline. */ - syntax &= ~RE_DOT_NEWLINE; - syntax |= RE_HAT_LISTS_NOT_NEWLINE; - /* It also changes the matching behavior. */ - preg->newline_anchor = 1; - } else - preg->newline_anchor = 0; - - preg->no_sub = !!(cflags & REG_NOSUB); - - /* POSIX says a null character in the pattern terminates it, so we - can use strlen here in compiling the pattern. */ - preg->re_nsub = 0; - preg->start = 0; - preg->se_params = 0; - preg->syntax_parens = 0; - preg->rx.nodec = 0; - preg->rx.epsnodec = 0; - preg->rx.instruction_table = 0; - preg->rx.nfa_states = 0; - preg->rx.local_cset_size = 256; - preg->rx.start = 0; - preg->rx.se_list_cmp = posix_se_list_order; - preg->rx.start_set = 0; - ret = rx_compile(pattern, strlen(pattern), syntax, preg); - alloca(0); - - /* POSIX doesn't distinguish between an unmatched open-group and an - unmatched close-group: both are REG_EPAREN. */ - if (ret == REG_ERPAREN) - ret = REG_EPAREN; - - return (int) ret; -} - - -/* regexec searches for a given pattern, specified by PREG, in the - string STRING. - - If NMATCH is zero or REG_NOSUB was set in the cflags argument to - `regcomp', we ignore PMATCH. 
Otherwise, we assume PMATCH has at - least NMATCH elements, and we set them to the offsets of the - corresponding matched substrings. - - EFLAGS specifies `execution flags' which affect matching: if - REG_NOTBOL is set, then ^ does not match at the beginning of the - string; if REG_NOTEOL is set, then $ does not match at the end. - - We return 0 if we find a match and REG_NOMATCH if not. */ - -#ifdef __STDC__ -int -regexec(__const__ regex_t * preg, __const__ char *string, - size_t nmatch, regmatch_t pmatch[], int eflags) -#else -int regexec(preg, string, nmatch, pmatch, eflags) -__const__ regex_t *preg; -__const__ char *string; -size_t nmatch; -regmatch_t pmatch[]; -int eflags; -#endif -{ - int ret; - struct re_registers regs; - regex_t private_preg; - int len = strlen(string); - boolean want_reg_info = !preg->no_sub && nmatch > 0; - - private_preg = *preg; - - private_preg.not_bol = !!(eflags & REG_NOTBOL); - private_preg.not_eol = !!(eflags & REG_NOTEOL); - - /* The user has told us exactly how many registers to return - * information about, via `nmatch'. We have to pass that on to the - * matching routines. - */ - private_preg.regs_allocated = REGS_FIXED; - - if (want_reg_info) { - regs.num_regs = nmatch; - regs.start = ((regoff_t *) malloc((nmatch) * sizeof(regoff_t))); - regs.end = ((regoff_t *) malloc((nmatch) * sizeof(regoff_t))); - if (regs.start == 0 || regs.end == 0) - return (int) REG_NOMATCH; - } - - /* Perform the searching operation. */ - ret = re_search(&private_preg, string, len, - /* start: */ 0, - /* range: */ len, - want_reg_info ? ®s : (struct re_registers *) 0); - - /* Copy the register information to the POSIX structure. */ - if (want_reg_info) { - if (ret >= 0) { - unsigned r; - - for (r = 0; r < nmatch; r++) { - pmatch[r].rm_so = regs.start[r]; - pmatch[r].rm_eo = regs.end[r]; - } - } - - /* If we needed the temporary register info, free the space now. 
*/ - free(regs.start); - free(regs.end); - } - - /* We want zero return to mean success, unlike `re_search'. */ - return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; -} - - -/* Returns a message corresponding to an error code, ERRCODE, returned - from either regcomp or regexec. */ - -#ifdef __STDC__ -size_t -regerror(int errcode, __const__ regex_t * preg, - char *errbuf, size_t errbuf_size) -#else -size_t regerror(errcode, preg, errbuf, errbuf_size) -int errcode; -__const__ regex_t *preg; -char *errbuf; -size_t errbuf_size; -#endif -{ - __const__ char *msg - = rx_error_msg[errcode] == 0 ? "Success" : rx_error_msg[errcode]; - size_t msg_size = strlen(msg) + 1; /* Includes the 0. */ - - if (errbuf_size != 0) { - if (msg_size > errbuf_size) { - strncpy(errbuf, msg, errbuf_size - 1); - errbuf[errbuf_size - 1] = 0; - } else - strcpy(errbuf, msg); - } - - return msg_size; -} - - -/* Free dynamically allocated space used by PREG. */ - -#ifdef __STDC__ -void regfree(regex_t * preg) -#else -void regfree(preg) -regex_t *preg; -#endif -{ - if (preg->buffer != 0) - free(preg->buffer); - preg->buffer = 0; - preg->allocated = 0; - - if (preg->fastmap != 0) - free(preg->fastmap); - preg->fastmap = 0; - preg->fastmap_accurate = 0; - - if (preg->translate != 0) - free(preg->translate); - preg->translate = 0; -} - -#endif /* not emacs */ |