diff options
Diffstat (limited to 'libc/misc/regex/regexec.c')
-rw-r--r-- | libc/misc/regex/regexec.c | 4335 |
1 files changed, 4335 insertions, 0 deletions
diff --git a/libc/misc/regex/regexec.c b/libc/misc/regex/regexec.c new file mode 100644 index 000000000..a985d9e7e --- /dev/null +++ b/libc/misc/regex/regexec.c @@ -0,0 +1,4335 @@ +/* Extended regular expression matching and search library. + Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, + int n) internal_function; +static void match_ctx_clean (re_match_context_t *mctx) internal_function; +static void match_ctx_free (re_match_context_t *cache) internal_function; +static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, + int str_idx, int from, int to) + internal_function; +static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) + internal_function; +static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, + int str_idx) internal_function; +static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, + int node, int str_idx) + internal_function; +static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, + re_dfastate_t **limited_sts, int last_node, + int last_str_idx) + internal_function; +static reg_errcode_t re_search_internal (const regex_t *preg, + const char *string, int length, + int start, int range, int stop, + size_t nmatch, regmatch_t pmatch[], + int eflags) internal_function; +static int re_search_2_stub (struct re_pattern_buffer *bufp, + const char *string1, int length1, + const char *string2, int length2, + int start, int range, struct re_registers *regs, + int stop, int ret_len) internal_function; +static int re_search_stub (struct re_pattern_buffer *bufp, + const char *string, int length, int start, + int range, int stop, struct re_registers *regs, + int ret_len) internal_function; +static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, + int nregs, int regs_allocated) internal_function; +static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) + internal_function; +static int check_matching (re_match_context_t *mctx, int fl_longest_match, + int *p_match_first) internal_function; +static int check_halt_state_context (const re_match_context_t *mctx, + const re_dfastate_t *state, int idx) + internal_function; +static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, + regmatch_t *prev_idx_match, int cur_node, + int cur_idx, int nmatch) internal_function; +static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, + int str_idx, int dest_node, int nregs, + regmatch_t *regs, + re_node_set *eps_via_nodes) + internal_function; +static reg_errcode_t set_regs (const regex_t *preg, + const re_match_context_t *mctx, + size_t nmatch, regmatch_t *pmatch, + int fl_backtrack) internal_function; +static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) + internal_function; + +#ifdef RE_ENABLE_I18N +static int sift_states_iter_mb (const re_match_context_t *mctx, + re_sift_context_t *sctx, + int node_idx, int str_idx, int max_str_idx) + internal_function; +#endif /* RE_ENABLE_I18N */ +static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, + re_sift_context_t *sctx) + internal_function; +static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, + re_sift_context_t *sctx, int str_idx, + re_node_set *cur_dest) + internal_function; +static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx, + re_sift_context_t *sctx, + int str_idx, + re_node_set *dest_nodes) + internal_function; +static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa, + re_node_set *dest_nodes, + const re_node_set *candidates) + internal_function; +static int check_dst_limits (const re_match_context_t *mctx, + re_node_set *limits, + int dst_node, int dst_idx, int src_node, + int src_idx) internal_function; +static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, + int boundaries, int subexp_idx, + int from_node, int bkref_idx) + internal_function; +static int check_dst_limits_calc_pos (const re_match_context_t *mctx, + int limit, int subexp_idx, + int node, int str_idx, + int bkref_idx) internal_function; +static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa, + re_node_set *dest_nodes, + const re_node_set *candidates, + re_node_set *limits, + struct re_backref_cache_entry *bkref_ents, + int str_idx) internal_function; +static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx, + re_sift_context_t *sctx, + int str_idx, const re_node_set *candidates) + internal_function; +static reg_errcode_t merge_state_array (const re_dfa_t *dfa, + re_dfastate_t **dst, + re_dfastate_t **src, int num) + internal_function; +static re_dfastate_t *find_recover_state (reg_errcode_t *err, + re_match_context_t *mctx) internal_function; +static re_dfastate_t *transit_state (reg_errcode_t *err, + re_match_context_t *mctx, + re_dfastate_t *state) internal_function; +static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, + re_match_context_t *mctx, + re_dfastate_t *next_state) + internal_function; +static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, + re_node_set *cur_nodes, + int str_idx) internal_function; +#if 0 +static re_dfastate_t *transit_state_sb (reg_errcode_t *err, + re_match_context_t *mctx, + re_dfastate_t *pstate) + internal_function; +#endif +#ifdef RE_ENABLE_I18N +static reg_errcode_t transit_state_mb (re_match_context_t *mctx, + re_dfastate_t *pstate) + internal_function; +#endif /* RE_ENABLE_I18N */ +static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, + const re_node_set *nodes) + internal_function; +static reg_errcode_t get_subexp (re_match_context_t *mctx, + int bkref_node, int bkref_str_idx) + internal_function; +static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, + const re_sub_match_top_t *sub_top, + re_sub_match_last_t *sub_last, + int bkref_node, int bkref_str) + internal_function; +static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, + int subexp_idx, int type) internal_function; +static reg_errcode_t check_arrival (re_match_context_t *mctx, + state_array_t *path, int top_node, + int top_str, int last_node, int last_str, + int type) internal_function; +static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, + int str_idx, + re_node_set *cur_nodes, + re_node_set *next_nodes) + internal_function; +static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa, + re_node_set *cur_nodes, + int ex_subexp, int type) + internal_function; +static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa, + re_node_set *dst_nodes, + int target, int ex_subexp, + int type) internal_function; +static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, + re_node_set *cur_nodes, int cur_str, + int subexp_num, int type) + internal_function; +static int build_trtable (const re_dfa_t *dfa, + re_dfastate_t *state) internal_function; +#ifdef RE_ENABLE_I18N +static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, + const re_string_t *input, int idx) + internal_function; +# ifdef _LIBC +static unsigned int find_collation_sequence_value (const unsigned char *mbs, + size_t name_len) + internal_function; +# endif /* _LIBC */ +#endif /* RE_ENABLE_I18N */ +static int group_nodes_into_DFAstates (const re_dfa_t *dfa, + const re_dfastate_t *state, + re_node_set *states_node, + bitset_t *states_ch) internal_function; +static int check_node_accept (const re_match_context_t *mctx, + const re_token_t *node, int idx) + internal_function; +static reg_errcode_t extend_buffers (re_match_context_t *mctx) + internal_function; + +/* Entry point for POSIX code. */ + +/* regexec searches for a given pattern, specified by PREG, in the + string STRING. + + If NMATCH is zero or REG_NOSUB was set in the cflags argument to + `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at + least NMATCH elements, and we set them to the offsets of the + corresponding matched substrings. + + EFLAGS specifies `execution flags' which affect matching: if + REG_NOTBOL is set, then ^ does not match at the beginning of the + string; if REG_NOTEOL is set, then $ does not match at the end. + + We return 0 if we find a match and REG_NOMATCH if not. */ + +int +regexec (preg, string, nmatch, pmatch, eflags) + const regex_t *__restrict preg; + const char *__restrict string; + size_t nmatch; + regmatch_t pmatch[]; + int eflags; +{ + reg_errcode_t err; + int start, length; +#ifndef __UCLIBC__ /* libc_lock_lock does not exist */ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; +#endif + + if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) + return REG_BADPAT; + + if (eflags & REG_STARTEND) + { + start = pmatch[0].rm_so; + length = pmatch[0].rm_eo; + } + else + { + start = 0; + length = strlen (string); + } + + __libc_lock_lock (dfa->lock); + if (preg->no_sub) + err = re_search_internal (preg, string, length, start, length - start, + length, 0, NULL, eflags); + else + err = re_search_internal (preg, string, length, start, length - start, + length, nmatch, pmatch, eflags); + __libc_lock_unlock (dfa->lock); + return err != REG_NOERROR; +} + +#ifdef _LIBC +# include <shlib-compat.h> +versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); + +# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) +__typeof__ (__regexec) __compat_regexec; + +int +attribute_compat_text_section +__compat_regexec (const regex_t *__restrict preg, + const char *__restrict string, size_t nmatch, + regmatch_t pmatch[], int eflags) +{ + return regexec (preg, string, nmatch, pmatch, + eflags & (REG_NOTBOL | REG_NOTEOL)); +} +compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); +# endif +#endif + +/* Entry points for GNU code. */ + +/* re_match, re_search, re_match_2, re_search_2 + + The former two functions operate on STRING with length LENGTH, + while the later two operate on concatenation of STRING1 and STRING2 + with lengths LENGTH1 and LENGTH2, respectively. + + re_match() matches the compiled pattern in BUFP against the string, + starting at index START. + + re_search() first tries matching at index START, then it tries to match + starting from index START + 1, and so on. The last start position tried + is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same + way as re_match().) + + The parameter STOP of re_{match,search}_2 specifies that no match exceeding + the first STOP characters of the concatenation of the strings should be + concerned. + + If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match + and all groups is stroed in REGS. (For the "_2" variants, the offsets are + computed relative to the concatenation, not relative to the individual + strings.) + + On success, re_match* functions return the length of the match, re_search* + return the position of the start of the match. Return value -1 means no + match was found and -2 indicates an internal error. */ + +int +re_match (bufp, string, length, start, regs) + struct re_pattern_buffer *bufp; + const char *string; + int length, start; + struct re_registers *regs; +{ + return re_search_stub (bufp, string, length, start, 0, length, regs, 1); +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__re_match, re_match) +#endif + +int +re_search (bufp, string, length, start, range, regs) + struct re_pattern_buffer *bufp; + const char *string; + int length, start, range; + struct re_registers *regs; +{ + return re_search_stub (bufp, string, length, start, range, length, regs, 0); +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__re_search, re_search) +#endif + +int +re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int length1, length2, start, stop; + struct re_registers *regs; +{ + return re_search_2_stub (bufp, string1, length1, string2, length2, + start, 0, regs, stop, 1); +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__re_match_2, re_match_2) +#endif + +int +re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int length1, length2, start, range, stop; + struct re_registers *regs; +{ + return re_search_2_stub (bufp, string1, length1, string2, length2, + start, range, regs, stop, 0); +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__re_search_2, re_search_2) +#endif + +static int +re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, + stop, ret_len) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int length1, length2, start, range, stop, ret_len; + struct re_registers *regs; +{ + const char *str; + int rval; + int len = length1 + length2; + int free_str = 0; + + if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) + return -2; + + /* Concatenate the strings. */ + if (length2 > 0) + if (length1 > 0) + { + char *s = re_malloc (char, len); + + if (BE (s == NULL, 0)) + return -2; +#ifdef _LIBC + memcpy (__mempcpy (s, string1, length1), string2, length2); +#else + memcpy (s, string1, length1); + memcpy (s + length1, string2, length2); +#endif + str = s; + free_str = 1; + } + else + str = string2; + else + str = string1; + + rval = re_search_stub (bufp, str, len, start, range, stop, regs, + ret_len); + if (free_str) + re_free ((char *) str); + return rval; +} + +/* The parameters have the same meaning as those of re_search. + Additional parameters: + If RET_LEN is nonzero the length of the match is returned (re_match style); + otherwise the position of the match is returned. */ + +static int +re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) + struct re_pattern_buffer *bufp; + const char *string; + int length, start, range, stop, ret_len; + struct re_registers *regs; +{ + reg_errcode_t result; + regmatch_t *pmatch; + int nregs, rval; + int eflags = 0; +#ifndef __UCLIBC__ /* libc_lock_lock does not exist */ + re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; +#endif + + /* Check for out-of-range. */ + if (BE (start < 0 || start > length, 0)) + return -1; + if (BE (start + range > length, 0)) + range = length - start; + else if (BE (start + range < 0, 0)) + range = -start; + + __libc_lock_lock (dfa->lock); + + eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; + eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; + + /* Compile fastmap if we haven't yet. */ + if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) + re_compile_fastmap (bufp); + + if (BE (bufp->no_sub, 0)) + regs = NULL; + + /* We need at least 1 register. */ + if (regs == NULL) + nregs = 1; + else if (BE (bufp->regs_allocated == REGS_FIXED && + regs->num_regs < bufp->re_nsub + 1, 0)) + { + nregs = regs->num_regs; + if (BE (nregs < 1, 0)) + { + /* Nothing can be copied to regs. */ + regs = NULL; + nregs = 1; + } + } + else + nregs = bufp->re_nsub + 1; + pmatch = re_malloc (regmatch_t, nregs); + if (BE (pmatch == NULL, 0)) + { + rval = -2; + goto out; + } + + result = re_search_internal (bufp, string, length, start, range, stop, + nregs, pmatch, eflags); + + rval = 0; + + /* I hope we needn't fill ther regs with -1's when no match was found. */ + if (result != REG_NOERROR) + rval = -1; + else if (regs != NULL) + { + /* If caller wants register contents data back, copy them. */ + bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, + bufp->regs_allocated); + if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) + rval = -2; + } + + if (BE (rval == 0, 1)) + { + if (ret_len) + { + assert (pmatch[0].rm_so == start); + rval = pmatch[0].rm_eo - start; + } + else + rval = pmatch[0].rm_so; + } + re_free (pmatch); + out: + __libc_lock_unlock (dfa->lock); + return rval; +} + +static unsigned +re_copy_regs (regs, pmatch, nregs, regs_allocated) + struct re_registers *regs; + regmatch_t *pmatch; + int nregs, regs_allocated; +{ + int rval = REGS_REALLOCATE; + int i; + int need_regs = nregs + 1; + /* We need one extra element beyond `num_regs' for the `-1' marker GNU code + uses. */ + + /* Have the register data arrays been allocated? */ + if (regs_allocated == REGS_UNALLOCATED) + { /* No. So allocate them with malloc. */ + regs->start = re_malloc (regoff_t, need_regs); + regs->end = re_malloc (regoff_t, need_regs); + if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0)) + return REGS_UNALLOCATED; + regs->num_regs = need_regs; + } + else if (regs_allocated == REGS_REALLOCATE) + { /* Yes. If we need more elements than were already + allocated, reallocate them. If we need fewer, just + leave it alone. */ + if (BE (need_regs > regs->num_regs, 0)) + { + regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); + regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs); + if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0)) + return REGS_UNALLOCATED; + regs->start = new_start; + regs->end = new_end; + regs->num_regs = need_regs; + } + } + else + { + assert (regs_allocated == REGS_FIXED); + /* This function may not be called with REGS_FIXED and nregs too big. */ + assert (regs->num_regs >= nregs); + rval = REGS_FIXED; + } + + /* Copy the regs. */ + for (i = 0; i < nregs; ++i) + { + regs->start[i] = pmatch[i].rm_so; + regs->end[i] = pmatch[i].rm_eo; + } + for ( ; i < regs->num_regs; ++i) + regs->start[i] = regs->end[i] = -1; + + return rval; +} + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use + this memory for recording register information. STARTS and ENDS + must be allocated using the malloc library routine, and must each + be at least NUM_REGS * sizeof (regoff_t) bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ + +void +re_set_registers (bufp, regs, num_regs, starts, ends) + struct re_pattern_buffer *bufp; + struct re_registers *regs; + unsigned num_regs; + regoff_t *starts, *ends; +{ + if (num_regs) + { + bufp->regs_allocated = REGS_REALLOCATE; + regs->num_regs = num_regs; + regs->start = starts; + regs->end = ends; + } + else + { + bufp->regs_allocated = REGS_UNALLOCATED; + regs->num_regs = 0; + regs->start = regs->end = (regoff_t *) 0; + } +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__re_set_registers, re_set_registers) +#endif + +/* Entry points compatible with 4.2 BSD regex library. We don't define + them unless specifically requested. */ + +#if defined _REGEX_RE_COMP || defined _LIBC || defined __UCLIBC__ +int +# if defined _LIBC || defined __UCLIBC__ +weak_function +# endif +re_exec (s) + const char *s; +{ + return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); +} +#endif /* _REGEX_RE_COMP */ + +/* Internal entry point. */ + +/* Searches for a compiled pattern PREG in the string STRING, whose + length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same + mingings with regexec. START, and RANGE have the same meanings + with re_search. + Return REG_NOERROR if we find a match, and REG_NOMATCH if not, + otherwise return the error code. + Note: We assume front end functions already check ranges. + (START + RANGE >= 0 && START + RANGE <= LENGTH) */ + +static reg_errcode_t +re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, + eflags) + const regex_t *preg; + const char *string; + int length, start, range, stop, eflags; + size_t nmatch; + regmatch_t pmatch[]; +{ + reg_errcode_t err; + const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; + int left_lim, right_lim, incr; + int fl_longest_match, match_first, match_kind, match_last = -1; + int extra_nmatch; + int sb, ch; +#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) + re_match_context_t mctx = { .dfa = dfa }; +#else + re_match_context_t mctx; +#endif + char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate + && range && !preg->can_be_null) ? preg->fastmap : NULL; + RE_TRANSLATE_TYPE t = preg->translate; + +#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) + memset (&mctx, '\0', sizeof (re_match_context_t)); + mctx.dfa = dfa; +#endif + + extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; + nmatch -= extra_nmatch; + + /* Check if the DFA haven't been compiled. */ + if (BE (preg->used == 0 || dfa->init_state == NULL + || dfa->init_state_word == NULL || dfa->init_state_nl == NULL + || dfa->init_state_begbuf == NULL, 0)) + return REG_NOMATCH; + +#ifdef DEBUG + /* We assume front-end functions already check them. */ + assert (start + range >= 0 && start + range <= length); +#endif + + /* If initial states with non-begbuf contexts have no elements, + the regex must be anchored. If preg->newline_anchor is set, + we'll never use init_state_nl, so do not check it. */ + if (dfa->init_state->nodes.nelem == 0 + && dfa->init_state_word->nodes.nelem == 0 + && (dfa->init_state_nl->nodes.nelem == 0 + || !preg->newline_anchor)) + { + if (start != 0 && start + range != 0) + return REG_NOMATCH; + start = range = 0; + } + + /* We must check the longest matching, if nmatch > 0. */ + fl_longest_match = (nmatch != 0 || dfa->nbackref); + + err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, + preg->translate, preg->syntax & RE_ICASE, dfa); + if (BE (err != REG_NOERROR, 0)) + goto free_return; + mctx.input.stop = stop; + mctx.input.raw_stop = stop; + mctx.input.newline_anchor = preg->newline_anchor; + + err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); + if (BE (err != REG_NOERROR, 0)) + goto free_return; + + /* We will log all the DFA states through which the dfa pass, + if nmatch > 1, or this dfa has "multibyte node", which is a + back-reference or a node which can accept multibyte character or + multi character collating element. */ + if (nmatch > 1 || dfa->has_mb_node) + { + mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); + if (BE (mctx.state_log == NULL, 0)) + { + err = REG_ESPACE; + goto free_return; + } + } + else + mctx.state_log = NULL; + + match_first = start; + mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF + : CONTEXT_NEWLINE | CONTEXT_BEGBUF; + + /* Check incrementally whether of not the input string match. */ + incr = (range < 0) ? -1 : 1; + left_lim = (range < 0) ? start + range : start; + right_lim = (range < 0) ? start : start + range; + sb = dfa->mb_cur_max == 1; + match_kind = + (fastmap + ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) + | (range >= 0 ? 2 : 0) + | (t != NULL ? 1 : 0)) + : 8); + + for (;; match_first += incr) + { + err = REG_NOMATCH; + if (match_first < left_lim || right_lim < match_first) + goto free_return; + + /* Advance as rapidly as possible through the string, until we + find a plausible place to start matching. This may be done + with varying efficiency, so there are various possibilities: + only the most common of them are specialized, in order to + save on code size. We use a switch statement for speed. */ + switch (match_kind) + { + case 8: + /* No fastmap. */ + break; + + case 7: + /* Fastmap with single-byte translation, match forward. */ + while (BE (match_first < right_lim, 1) + && !fastmap[t[(unsigned char) string[match_first]]]) + ++match_first; + goto forward_match_found_start_or_reached_end; + + case 6: + /* Fastmap without translation, match forward. */ + while (BE (match_first < right_lim, 1) + && !fastmap[(unsigned char) string[match_first]]) + ++match_first; + + forward_match_found_start_or_reached_end: + if (BE (match_first == right_lim, 0)) + { + ch = match_first >= length + ? 0 : (unsigned char) string[match_first]; + if (!fastmap[t ? t[ch] : ch]) + goto free_return; + } + break; + + case 4: + case 5: + /* Fastmap without multi-byte translation, match backwards. */ + while (match_first >= left_lim) + { + ch = match_first >= length + ? 0 : (unsigned char) string[match_first]; + if (fastmap[t ? t[ch] : ch]) + break; + --match_first; + } + if (match_first < left_lim) + goto free_return; + break; + + default: + /* In this case, we can't determine easily the current byte, + since it might be a component byte of a multibyte + character. Then we use the constructed buffer instead. */ + for (;;) + { + /* If MATCH_FIRST is out of the valid range, reconstruct the + buffers. */ + unsigned int offset = match_first - mctx.input.raw_mbs_idx; + if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0)) + { + err = re_string_reconstruct (&mctx.input, match_first, + eflags); + if (BE (err != REG_NOERROR, 0)) + goto free_return; + + offset = match_first - mctx.input.raw_mbs_idx; + } + /* If MATCH_FIRST is out of the buffer, leave it as '\0'. + Note that MATCH_FIRST must not be smaller than 0. */ + ch = (match_first >= length + ? 0 : re_string_byte_at (&mctx.input, offset)); + if (fastmap[ch]) + break; + match_first += incr; + if (match_first < left_lim || match_first > right_lim) + { + err = REG_NOMATCH; + goto free_return; + } + } + break; + } + + /* Reconstruct the buffers so that the matcher can assume that + the matching starts from the beginning of the buffer. */ + err = re_string_reconstruct (&mctx.input, match_first, eflags); + if (BE (err != REG_NOERROR, 0)) + goto free_return; + +#ifdef RE_ENABLE_I18N + /* Don't consider this char as a possible match start if it part, + yet isn't the head, of a multibyte character. */ + if (!sb && !re_string_first_byte (&mctx.input, 0)) + continue; +#endif + + /* It seems to be appropriate one, then use the matcher. */ + /* We assume that the matching starts from 0. */ + mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; + match_last = check_matching (&mctx, fl_longest_match, + range >= 0 ? &match_first : NULL); + if (match_last != -1) + { + if (BE (match_last == -2, 0)) + { + err = REG_ESPACE; + goto free_return; + } + else + { + mctx.match_last = match_last; + if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) + { + re_dfastate_t *pstate = mctx.state_log[match_last]; + mctx.last_node = check_halt_state_context (&mctx, pstate, + match_last); + } + if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) + || dfa->nbackref) + { + err = prune_impossible_nodes (&mctx); + if (err == REG_NOERROR) + break; + if (BE (err != REG_NOMATCH, 0)) + goto free_return; + match_last = -1; + } + else + break; /* We found a match. */ + } + } + + match_ctx_clean (&mctx); + } + +#ifdef DEBUG + assert (match_last != -1); + assert (err == REG_NOERROR); +#endif + + /* Set pmatch[] if we need. */ + if (nmatch > 0) + { + int reg_idx; + + /* Initialize registers. */ + for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) + pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; + + /* Set the points where matching start/end. */ + pmatch[0].rm_so = 0; + pmatch[0].rm_eo = mctx.match_last; + + if (!preg->no_sub && nmatch > 1) + { + err = set_regs (preg, &mctx, nmatch, pmatch, + dfa->has_plural_match && dfa->nbackref > 0); + if (BE (err != REG_NOERROR, 0)) + goto free_return; + } + + /* At last, add the offset to the each registers, since we slided + the buffers so that we could assume that the matching starts + from 0. */ + for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) + if (pmatch[reg_idx].rm_so != -1) + { +#ifdef RE_ENABLE_I18N + if (BE (mctx.input.offsets_needed != 0, 0)) + { + pmatch[reg_idx].rm_so = + (pmatch[reg_idx].rm_so == mctx.input.valid_len + ? mctx.input.valid_raw_len + : mctx.input.offsets[pmatch[reg_idx].rm_so]); + pmatch[reg_idx].rm_eo = + (pmatch[reg_idx].rm_eo == mctx.input.valid_len + ? mctx.input.valid_raw_len + : mctx.input.offsets[pmatch[reg_idx].rm_eo]); + } +#else + assert (mctx.input.offsets_needed == 0); +#endif + pmatch[reg_idx].rm_so += match_first; + pmatch[reg_idx].rm_eo += match_first; + } + for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) + { + pmatch[nmatch + reg_idx].rm_so = -1; + pmatch[nmatch + reg_idx].rm_eo = -1; + } + + if (dfa->subexp_map) + for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) + if (dfa->subexp_map[reg_idx] != reg_idx) + { + pmatch[reg_idx + 1].rm_so + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; + pmatch[reg_idx + 1].rm_eo + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; + } + } + + free_return: + re_free (mctx.state_log); + if (dfa->nbackref) + match_ctx_free (&mctx); + re_string_destruct (&mctx.input); + return err; +} + +static reg_errcode_t +prune_impossible_nodes (mctx) + re_match_context_t *mctx; +{ + const re_dfa_t *const dfa = mctx->dfa; + int halt_node, match_last; + reg_errcode_t ret; + re_dfastate_t **sifted_states; + re_dfastate_t **lim_states = NULL; + re_sift_context_t sctx; +#ifdef DEBUG + assert (mctx->state_log != NULL); +#endif + match_last = mctx->match_last; + halt_node = mctx->last_node; + sifted_states = re_malloc (re_dfastate_t *, match_last + 1); + if (BE (sifted_states == NULL, 0)) + { + ret = REG_ESPACE; + goto free_return; + } + if (dfa->nbackref) + { + lim_states = re_malloc (re_dfastate_t *, match_last + 1); + if (BE (lim_states == NULL, 0)) + { + ret = REG_ESPACE; + goto free_return; + } + while (1) + { + memset (lim_states, '\0', + sizeof (re_dfastate_t *) * (match_last + 1)); + sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, + match_last); + ret = sift_states_backward (mctx, &sctx); + re_node_set_free (&sctx.limits); + if (BE (ret != REG_NOERROR, 0)) + goto free_return; + if (sifted_states[0] != NULL || lim_states[0] != NULL) + break; + do + { + --match_last; + if (match_last < 0) + { + ret = REG_NOMATCH; + goto free_return; + } + } while (mctx->state_log[match_last] == NULL + || !mctx->state_log[match_last]->halt); + halt_node = check_halt_state_context (mctx, + mctx->state_log[match_last], + match_last); + } + ret = merge_state_array (dfa, sifted_states, lim_states, + match_last + 1); + re_free (lim_states); + lim_states = NULL; + if (BE (ret != REG_NOERROR, 0)) + goto free_return; + } + else + { + sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); + ret = sift_states_backward (mctx, &sctx); + re_node_set_free (&sctx.limits); + if (BE (ret != REG_NOERROR, 0)) + goto free_return; + } + re_free (mctx->state_log); + mctx->state_log = sifted_states; + sifted_states = NULL; + mctx->last_node = halt_node; + mctx->match_last = match_last; + ret = REG_NOERROR; + free_return: + re_free (sifted_states); + re_free (lim_states); + return ret; +} + +/* Acquire an initial state and return it. + We must select appropriate initial state depending on the context, + since initial states may have constraints like "\<", "^", etc.. */ + +static inline re_dfastate_t * +__attribute ((always_inline)) internal_function +acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, + int idx) +{ + const re_dfa_t *const dfa = mctx->dfa; + if (dfa->init_state->has_constraint) + { + unsigned int context; + context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); + if (IS_WORD_CONTEXT (context)) + return dfa->init_state_word; + else if (IS_ORDINARY_CONTEXT (context)) + return dfa->init_state; + else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) + return dfa->init_state_begbuf; + else if (IS_NEWLINE_CONTEXT (context)) + return dfa->init_state_nl; + else if (IS_BEGBUF_CONTEXT (context)) + { + /* It is relatively rare case, then calculate on demand. */ + return re_acquire_state_context (err, dfa, + dfa->init_state->entrance_nodes, + context); + } + else + /* Must not happen? */ + return dfa->init_state; + } + else + return dfa->init_state; +} + +/* Check whether the regular expression match input string INPUT or not, + and return the index where the matching end, return -1 if not match, + or return -2 in case of an error. + FL_LONGEST_MATCH means we want the POSIX longest matching. + If P_MATCH_FIRST is not NULL, and the match fails, it is set to the + next place where we may want to try matching. + Note that the matcher assume that the maching starts from the current |