diff options
Diffstat (limited to 'system/lib')
-rw-r--r-- | system/lib/libc/musl/src/regex/regcomp.c | 3352 | ||||
-rw-r--r-- | system/lib/libc/musl/src/regex/regerror.c | 35 | ||||
-rw-r--r-- | system/lib/libc/musl/src/regex/regexec.c | 1011 | ||||
-rw-r--r-- | system/lib/libc/musl/src/regex/tre-mem.c | 158 | ||||
-rw-r--r-- | system/lib/libc/musl/src/regex/tre.h | 231 | ||||
-rw-r--r-- | system/lib/libcextra.symbols | 7 |
6 files changed, 4794 insertions, 0 deletions
diff --git a/system/lib/libc/musl/src/regex/regcomp.c b/system/lib/libc/musl/src/regex/regcomp.c new file mode 100644 index 00000000..5cedfd52 --- /dev/null +++ b/system/lib/libc/musl/src/regex/regcomp.c @@ -0,0 +1,3352 @@ +/* + regcomp.c - TRE POSIX compatible regex compilation functions. + + Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi> + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include <string.h> +#include <errno.h> +#include <stdlib.h> +#include <regex.h> +#include <limits.h> +#include <stdint.h> + +#include "tre.h" + +#include <assert.h> + +/*********************************************************************** + from tre-compile.h +***********************************************************************/ + +typedef struct { + int position; + int code_min; + int code_max; + int *tags; + int assertions; + tre_ctype_t class; + tre_ctype_t *neg_classes; + int backref; +} tre_pos_and_tags_t; + + +/*********************************************************************** + from tre-ast.c and tre-ast.h +***********************************************************************/ + +/* The different AST node types. */ +typedef enum { + LITERAL, + CATENATION, + ITERATION, + UNION +} tre_ast_type_t; + +/* Special subtypes of TRE_LITERAL. */ +#define EMPTY -1 /* Empty leaf (denotes empty string). */ +#define ASSERTION -2 /* Assertion leaf. */ +#define TAG -3 /* Tag leaf. */ +#define BACKREF -4 /* Back reference leaf. */ + +#define IS_SPECIAL(x) ((x)->code_min < 0) +#define IS_EMPTY(x) ((x)->code_min == EMPTY) +#define IS_ASSERTION(x) ((x)->code_min == ASSERTION) +#define IS_TAG(x) ((x)->code_min == TAG) +#define IS_BACKREF(x) ((x)->code_min == BACKREF) + + +/* A generic AST node. All AST nodes consist of this node on the top + level with `obj' pointing to the actual content. */ +typedef struct { + tre_ast_type_t type; /* Type of the node. */ + void *obj; /* Pointer to actual node. */ + int nullable; + int submatch_id; + int num_submatches; + int num_tags; + tre_pos_and_tags_t *firstpos; + tre_pos_and_tags_t *lastpos; +} tre_ast_node_t; + + +/* A "literal" node. These are created for assertions, back references, + tags, matching parameter settings, and all expressions that match one + character. */ +typedef struct { + long code_min; + long code_max; + int position; + tre_ctype_t class; + tre_ctype_t *neg_classes; +} tre_literal_t; + +/* A "catenation" node. These are created when two regexps are concatenated. + If there are more than one subexpressions in sequence, the `left' part + holds all but the last, and `right' part holds the last subexpression + (catenation is left associative). */ +typedef struct { + tre_ast_node_t *left; + tre_ast_node_t *right; +} tre_catenation_t; + +/* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}" + operators. */ +typedef struct { + /* Subexpression to match. */ + tre_ast_node_t *arg; + /* Minimum number of consecutive matches. */ + int min; + /* Maximum number of consecutive matches. */ + int max; + /* If 0, match as many characters as possible, if 1 match as few as + possible. Note that this does not always mean the same thing as + matching as many/few repetitions as possible. */ + unsigned int minimal:1; +} tre_iteration_t; + +/* An "union" node. These are created for the "|" operator. */ +typedef struct { + tre_ast_node_t *left; + tre_ast_node_t *right; +} tre_union_t; + +static tre_ast_node_t * +tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size); + +static tre_ast_node_t * +tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position); + +static tre_ast_node_t * +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, + int minimal); + +static tre_ast_node_t * +tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right); + +static tre_ast_node_t * +tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, + tre_ast_node_t *right); + + +static tre_ast_node_t * +tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size) +{ + tre_ast_node_t *node; + + node = tre_mem_calloc(mem, sizeof(*node)); + if (!node) + return NULL; + node->obj = tre_mem_calloc(mem, size); + if (!node->obj) + return NULL; + node->type = type; + node->nullable = -1; + node->submatch_id = -1; + + return node; +} + +static tre_ast_node_t * +tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) +{ + tre_ast_node_t *node; + tre_literal_t *lit; + + node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t)); + if (!node) + return NULL; + lit = node->obj; + lit->code_min = code_min; + lit->code_max = code_max; + lit->position = position; + + return node; +} + +static tre_ast_node_t * +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, + int minimal) +{ + tre_ast_node_t *node; + tre_iteration_t *iter; + + node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t)); + if (!node) + return NULL; + iter = node->obj; + iter->arg = arg; + iter->min = min; + iter->max = max; + iter->minimal = minimal; + node->num_submatches = arg->num_submatches; + + return node; +} + +static tre_ast_node_t * +tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) +{ + tre_ast_node_t *node; + + node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t)); + if (node == NULL) + return NULL; + ((tre_union_t *)node->obj)->left = left; + ((tre_union_t *)node->obj)->right = right; + node->num_submatches = left->num_submatches + right->num_submatches; + + return node; +} + +static tre_ast_node_t * +tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, + tre_ast_node_t *right) +{ + tre_ast_node_t *node; + + node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t)); + if (node == NULL) + return NULL; + ((tre_catenation_t *)node->obj)->left = left; + ((tre_catenation_t *)node->obj)->right = right; + node->num_submatches = left->num_submatches + right->num_submatches; + + return node; +} + + +/*********************************************************************** + from tre-stack.c and tre-stack.h +***********************************************************************/ + +typedef struct tre_stack_rec tre_stack_t; + +/* Creates a new stack object. `size' is initial size in bytes, `max_size' + is maximum size, and `increment' specifies how much more space will be + allocated with realloc() if all space gets used up. Returns the stack + object or NULL if out of memory. */ +static tre_stack_t * +tre_stack_new(int size, int max_size, int increment); + +/* Frees the stack object. */ +static void +tre_stack_destroy(tre_stack_t *s); + +/* Returns the current number of objects in the stack. */ +static int +tre_stack_num_objects(tre_stack_t *s); + +/* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes + `value' on top of stack `s'. Returns REG_ESPACE if out of memory. + This tries to realloc() more space before failing if maximum size + has not yet been reached. Returns REG_OK if successful. */ +#define declare_pushf(typetag, type) \ + static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value) + +declare_pushf(voidptr, void *); +declare_pushf(int, int); + +/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost + element off of stack `s' and returns it. The stack must not be + empty. */ +#define declare_popf(typetag, type) \ + static type tre_stack_pop_ ## typetag(tre_stack_t *s) + +declare_popf(voidptr, void *); +declare_popf(int, int); + +/* Just to save some typing. */ +#define STACK_PUSH(s, typetag, value) \ + do \ + { \ + status = tre_stack_push_ ## typetag(s, value); \ + } \ + while (/*CONSTCOND*/0) + +#define STACK_PUSHX(s, typetag, value) \ + { \ + status = tre_stack_push_ ## typetag(s, value); \ + if (status != REG_OK) \ + break; \ + } + +#define STACK_PUSHR(s, typetag, value) \ + { \ + reg_errcode_t _status; \ + _status = tre_stack_push_ ## typetag(s, value); \ + if (_status != REG_OK) \ + return _status; \ + } + +union tre_stack_item { + void *voidptr_value; + int int_value; +}; + +struct tre_stack_rec { + int size; + int max_size; + int increment; + int ptr; + union tre_stack_item *stack; +}; + + +static tre_stack_t * +tre_stack_new(int size, int max_size, int increment) +{ + tre_stack_t *s; + + s = xmalloc(sizeof(*s)); + if (s != NULL) + { + s->stack = xmalloc(sizeof(*s->stack) * size); + if (s->stack == NULL) + { + xfree(s); + return NULL; + } + s->size = size; + s->max_size = max_size; + s->increment = increment; + s->ptr = 0; + } + return s; +} + +static void +tre_stack_destroy(tre_stack_t *s) +{ + xfree(s->stack); + xfree(s); +} + +static int +tre_stack_num_objects(tre_stack_t *s) +{ + return s->ptr; +} + +static reg_errcode_t +tre_stack_push(tre_stack_t *s, union tre_stack_item value) +{ + if (s->ptr < s->size) + { + s->stack[s->ptr] = value; + s->ptr++; + } + else + { + if (s->size >= s->max_size) + { + return REG_ESPACE; + } + else + { + union tre_stack_item *new_buffer; + int new_size; + new_size = s->size + s->increment; + if (new_size > s->max_size) + new_size = s->max_size; + new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size); + if (new_buffer == NULL) + { + return REG_ESPACE; + } + assert(new_size > s->size); + s->size = new_size; + s->stack = new_buffer; + tre_stack_push(s, value); + } + } + return REG_OK; +} + +#define define_pushf(typetag, type) \ + declare_pushf(typetag, type) { \ + union tre_stack_item item; \ + item.typetag ## _value = value; \ + return tre_stack_push(s, item); \ +} + +define_pushf(int, int) +define_pushf(voidptr, void *) + +#define define_popf(typetag, type) \ + declare_popf(typetag, type) { \ + return s->stack[--s->ptr].typetag ## _value; \ + } + +define_popf(int, int) +define_popf(voidptr, void *) + + +/*********************************************************************** + from tre-parse.c and tre-parse.h +***********************************************************************/ + +/* Parse context. */ +typedef struct { + /* Memory allocator. The AST is allocated using this. */ + tre_mem_t mem; + /* Stack used for keeping track of regexp syntax. */ + tre_stack_t *stack; + /* The parse result. */ + tre_ast_node_t *result; + /* The regexp to parse and its length. */ + const char *re; + /* The first character of the entire regexp. */ + const char *re_start; + /* Current submatch ID. */ + int submatch_id; + /* Current position (number of literal). */ + int position; + /* The highest back reference or -1 if none seen so far. */ + int max_backref; + /* This flag is set if the regexp uses approximate matching. */ + int have_approx; + /* Compilation flags. */ + int cflags; + /* If this flag is set the top-level submatch is not captured. */ + int nofirstsub; +} tre_parse_ctx_t; + +/* Parses a wide character regexp pattern into a syntax tree. This parser + handles both syntaxes (BRE and ERE), including the TRE extensions. */ +static reg_errcode_t +tre_parse(tre_parse_ctx_t *ctx); + + +/* + This parser is just a simple recursive descent parser for POSIX.2 + regexps. The parser supports both the obsolete default syntax and + the "extended" syntax, and some nonstandard extensions. +*/ + +/* Characters with special meanings in regexp syntax. */ +#define CHAR_PIPE '|' +#define CHAR_LPAREN '(' +#define CHAR_RPAREN ')' +#define CHAR_LBRACE '{' +#define CHAR_RBRACE '}' +#define CHAR_LBRACKET '[' +#define CHAR_RBRACKET ']' +#define CHAR_MINUS '-' +#define CHAR_STAR '*' +#define CHAR_QUESTIONMARK '?' +#define CHAR_PLUS '+' +#define CHAR_PERIOD '.' +#define CHAR_COLON ':' +#define CHAR_EQUAL '=' +#define CHAR_COMMA ',' +#define CHAR_CARET '^' +#define CHAR_DOLLAR '$' +#define CHAR_BACKSLASH '\\' +#define CHAR_HASH '#' +#define CHAR_TILDE '~' + + +/* Some macros for expanding \w, \s, etc. */ +static const struct tre_macro_struct { + const char c; + const char *expansion; +} tre_macros[] = + { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, + {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, + {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, + {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, + { 0, NULL } + }; + + +/* Expands a macro delimited by `regex' and `regex_end' to `buf', which + must have at least `len' items. Sets buf[0] to zero if the there + is no match in `tre_macros'. */ +static const char * +tre_expand_macro(const char *regex) +{ + int i; + + if (!*regex) + return 0; + + for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++); + return tre_macros[i].expansion; +} + +static reg_errcode_t +tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, + tre_ast_node_t ***items) +{ + reg_errcode_t status; + tre_ast_node_t **array = *items; + /* Allocate more space if necessary. */ + if (*i >= *max_i) + { + tre_ast_node_t **new_items; + /* If the array is already 1024 items large, give up -- there's + probably an error in the regexp (e.g. not a '\0' terminated + string and missing ']') */ + if (*max_i > 1024) + return REG_ESPACE; + *max_i *= 2; + new_items = xrealloc(array, sizeof(*items) * *max_i); + if (new_items == NULL) + return REG_ESPACE; + *items = array = new_items; + } + array[*i] = tre_ast_new_literal(mem, min, max, -1); + status = array[*i] == NULL ? REG_ESPACE : REG_OK; + (*i)++; + return status; +} + + +static int +tre_compare_items(const void *a, const void *b) +{ + const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; + const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; + tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; + int a_min = l_a->code_min, b_min = l_b->code_min; + + if (a_min < b_min) + return -1; + else if (a_min > b_min) + return 1; + else + return 0; +} + +/* Maximum number of character classes that can occur in a negated bracket + expression. */ +#define MAX_NEG_CLASSES 64 + +/* Maximum length of character class names. */ +#define MAX_CLASS_NAME + +static reg_errcode_t +tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, + tre_ctype_t neg_classes[], int *num_neg_classes, + tre_ast_node_t ***items, int *num_items, + int *items_size) +{ + const char *re = ctx->re; + reg_errcode_t status = REG_OK; + tre_ctype_t class = (tre_ctype_t)0; + int i = *num_items; + int max_i = *items_size; + int skip; + + /* Build an array of the items in the bracket expression. */ + while (status == REG_OK) + { + skip = 0; + if (!*re) + { + status = REG_EBRACK; + } + else if (*re == CHAR_RBRACKET && re > ctx->re) + { + re++; + break; + } + else + { + tre_cint_t min = 0, max = 0; + wchar_t wc; + int clen = mbtowc(&wc, re, -1); + + if (clen<0) clen=1, wc=WEOF; + + class = (tre_ctype_t)0; + if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET) + { + min = wc; + re += clen+1; + clen = mbtowc(&wc, re, -1); + if (clen<0) clen=1, wc=WEOF; + max = wc; + re += clen; + /* XXX - Should use collation order instead of encoding values + in character ranges. */ + if (min > max) + status = REG_ERANGE; + } + else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) + status = REG_ECOLLATE; + else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) + status = REG_ECOLLATE; + else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) + { + char tmp_str[64]; + const char *endptr = re + 2; + int len; + while (*endptr && *endptr != CHAR_COLON) + endptr++; + if (*endptr) + { + len = MIN(endptr - re - 2, 63); + strncpy(tmp_str, re + 2, len); + tmp_str[len] = '\0'; + class = tre_ctype(tmp_str); + if (!class) + status = REG_ECTYPE; + re = endptr + 2; + } + else + status = REG_ECTYPE; + min = 0; + max = TRE_CHAR_MAX; + } + else + { + if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET + && ctx->re != re) + /* Two ranges are not allowed to share and endpoint. */ + status = REG_ERANGE; + min = max = wc; + re += clen; + } + + if (status != REG_OK) + break; + + if (class && negate) + if (*num_neg_classes >= MAX_NEG_CLASSES) + status = REG_ESPACE; + else + neg_classes[(*num_neg_classes)++] = class; + else if (!skip) + { + status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); + if (status != REG_OK) + break; + ((tre_literal_t*)((*items)[i-1])->obj)->class = class; + } + + /* Add opposite-case counterpoints if REG_ICASE is present. + This is broken if there are more than two "same" characters. */ + if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) + { + tre_cint_t cmin, ccurr; + + while (min <= max) + { + if (tre_islower(min)) + { + cmin = ccurr = tre_toupper(min++); + while (tre_islower(min) && tre_toupper(min) == ccurr + 1 + && min <= max) + ccurr = tre_toupper(min++); + status = tre_new_item(ctx->mem, cmin, ccurr, + &i, &max_i, items); + } + else if (tre_isupper(min)) + { + cmin = ccurr = tre_tolower(min++); + while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 + && min <= max) + ccurr = tre_tolower(min++); + status = tre_new_item(ctx->mem, cmin, ccurr, + &i, &max_i, items); + } + else min++; + if (status != REG_OK) + break; + } + if (status != REG_OK) + break; + } + } + } + *num_items = i; + *items_size = max_i; + ctx->re = re; + return status; +} + +static reg_errcode_t +tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +{ + tre_ast_node_t *node = NULL; + int negate = 0; + reg_errcode_t status = REG_OK; + tre_ast_node_t **items, *u, *n; + int i = 0, j, max_i = 32, curr_max, curr_min; + tre_ctype_t neg_classes[MAX_NEG_CLASSES]; + int num_neg_classes = 0; + + /* Start off with an array of `max_i' elements. */ + items = xmalloc(sizeof(*items) * max_i); + if (items == NULL) + return REG_ESPACE; + + if (*ctx->re == CHAR_CARET) + { + negate = 1; + ctx->re++; + } + + status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, + &items, &i, &max_i); + + if (status != REG_OK) + goto parse_bracket_done; + + /* Sort the array if we need to negate it. */ + if (negate) + qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); + + curr_max = curr_min = 0; + /* Build a union of the items in the array, negated if necessary. */ + for (j = 0; j < i && status == REG_OK; j++) + { + int min, max; + tre_literal_t *l = items[j]->obj; + min = l->code_min; + max = l->code_max; + + if (negate) + { + if (min < curr_max) + { + /* Overlap. */ + curr_max = MAX(max + 1, curr_max); + l = NULL; + } + else + { + /* No overlap. */ + curr_max = min - 1; + if (curr_max >= curr_min) + { + l->code_min = curr_min; + l->code_max = curr_max; + } + else + { + l = NULL; + } + curr_min = curr_max = max + 1; + } + } + + if (l != NULL) + { + int k; + l->position = ctx->position; + if (num_neg_classes > 0) + { + l->neg_classes = tre_mem_alloc(ctx->mem, + (sizeof(l->neg_classes) + * (num_neg_classes + 1))); + if (l->neg_classes == NULL) + { + status = REG_ESPACE; + break; + } + for (k = 0; k < num_neg_classes; k++) + l->neg_classes[k] = neg_classes[k]; + l->neg_classes[k] = (tre_ctype_t)0; + } + else + l->neg_classes = NULL; + if (node == NULL) + node = items[j]; + else + { + u = tre_ast_new_union(ctx->mem, node, items[j]); + if (u == NULL) + status = REG_ESPACE; + node = u; + } + } + } + + if (status != REG_OK) + goto parse_bracket_done; + + if (negate) + { + int k; + n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); + if (n == NULL) + status = REG_ESPACE; + else + { + tre_literal_t *l = n->obj; + if (num_neg_classes > 0) + { + l->neg_classes = tre_mem_alloc(ctx->mem, + (sizeof(l->neg_classes) + * (num_neg_classes + 1))); + if (l->neg_classes == NULL) + { + status = REG_ESPACE; + goto parse_bracket_done; + } + for (k = 0; k < num_neg_classes; k++) + l->neg_classes[k] = neg_classes[k]; + l->neg_classes[k] = (tre_ctype_t)0; + } + else + l->neg_classes = NULL; + if (node == NULL) + node = n; + else + { + u = tre_ast_new_union(ctx->mem, node, n); + if (u == NULL) + status = REG_ESPACE; + node = u; + } + } + } + + if (status != REG_OK) + goto parse_bracket_done; + +#ifdef TRE_DEBUG + tre_ast_print(node); +#endif /* TRE_DEBUG */ + + parse_bracket_done: + xfree(items); + ctx->position++; + *result = node; + return status; +} + + +/* Parses a positive decimal integer. Returns -1 if the string does not + contain a valid number. */ +static int +tre_parse_int(const char **regex) +{ + int num = -1; + const char *r = *regex; + while (*r-'0'<10U) + { + if (num < 0) + num = 0; + num = num * 10 + *r - '0'; + r++; + } + *regex = r; + return num; +} + + +static reg_errcode_t +tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +{ + int min, max; + const char *r = ctx->re; + int minimal = 0; + + /* Parse number (minimum repetition count). */ + min = -1; + if (*r >= '0' && *r <= '9') { + min = tre_parse_int(&r); + } + + /* Parse comma and second number (maximum repetition count). */ + max = min; + if (*r == CHAR_COMMA) + { + r++; + max = tre_parse_int(&r); + } + + /* Check that the repeat counts are sane. */ + if ((max >= 0 && min > max) || max > RE_DUP_MAX) + return REG_BADBR; + + /* Missing }. */ + if (!*r) + return REG_EBRACE; + + /* Empty contents of {}. */ + if (r == ctx->re) + return REG_BADBR; + + /* Parse the ending '}' or '\}'.*/ + if (ctx->cflags & REG_EXTENDED) + { + if (*r != CHAR_RBRACE) + return REG_BADBR; + r++; + } + else + { + if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE) + return REG_BADBR; + r += 2; + } + + /* Create the AST node(s). */ + if (min == 0 && max == 0) + { + *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + if (*result == NULL) + return REG_ESPACE; + } + else + { + if (min < 0 && max < 0) + /* Only approximate parameters set, no repetitions. */ + min = max = 1; + + *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); + if (!*result) + return REG_ESPACE; + } + + ctx->re = r; + return REG_OK; +} + +typedef enum { + PARSE_RE = 0, + PARSE_ATOM, + PARSE_MARK_FOR_SUBMATCH, + PARSE_BRANCH, + PARSE_PIECE, + PARSE_CATENATION, + PARSE_POST_CATENATION, + PARSE_UNION, + PARSE_POST_UNION, + PARSE_POSTFIX, + PARSE_RESTORE_CFLAGS +} tre_parse_re_stack_symbol_t; + + +static reg_errcode_t +tre_parse(tre_parse_ctx_t *ctx) +{ + tre_ast_node_t *result = NULL; + tre_parse_re_stack_symbol_t symbol; + reg_errcode_t status = REG_OK; + tre_stack_t *stack = ctx->stack; + int bottom = tre_stack_num_objects(stack); + int depth = 0; + wchar_t wc; + int clen; + + if (!ctx->nofirstsub) + { + STACK_PUSH(stack, int, ctx->submatch_id); + STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); + ctx->submatch_id++; + } + STACK_PUSH(stack, int, PARSE_RE); + ctx->re_start = ctx->re; + + + /* The following is basically just a recursive descent parser. I use + an explicit stack instead of recursive functions mostly because of + two reasons: compatibility with systems which have an overflowable + call stack, and efficiency (both in lines of code and speed). */ + while (tre_stack_num_objects(stack) > bottom && status == REG_OK) + { + if (status != REG_OK) + break; + symbol = tre_stack_pop_int(stack); + switch (symbol) + { + case PARSE_RE: + /* Parse a full regexp. A regexp is one or more branches, + separated by the union operator `|'. */ + if (ctx->cflags & REG_EXTENDED) + STACK_PUSHX(stack, int, PARSE_UNION); + STACK_PUSHX(stack, int, PARSE_BRANCH); + break; + + case PARSE_BRANCH: + /* Parse a branch. A branch is one or more pieces, concatenated. + A piece is an atom possibly followed by a postfix operator. */ + STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, int, PARSE_PIECE); + break; + + case PARSE_PIECE: + /* Parse a piece. A piece is an atom possibly followed by one + or more postfix operators. */ + STACK_PUSHX(stack, int, PARSE_POSTFIX); + STACK_PUSHX(stack, int, PARSE_ATOM); + break; + + case PARSE_CATENATION: + /* If the expression has not ended, parse another piece. */ + { + tre_char_t c; + if (!*ctx->re) + break; + c = *ctx->re; + if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) + break; + if ((ctx->cflags & REG_EXTENDED + && c == CHAR_RPAREN && depth > 0) + || (!(ctx->cflags & REG_EXTENDED) + && (c == CHAR_BACKSLASH + && *(ctx->re + 1) == CHAR_RPAREN))) + { + if (!(ctx->cflags & REG_EXTENDED) && depth == 0) + status = REG_EPAREN; + depth--; + if (!(ctx->cflags & REG_EXTENDED)) + ctx->re += 2; + break; + } + + { + /* Default case, left associative concatenation. */ + STACK_PUSHX(stack, int, PARSE_CATENATION); + STACK_PUSHX(stack, voidptr, result); + STACK_PUSHX(stack, int, PARSE_POST_CATENATION); + STACK_PUSHX(stack, int, PARSE_PIECE); + } + break; + } + + case PARSE_POST_CATENATION: + { + tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); + tre_ast_node_t *tmp_node; + tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); + if (!tmp_node) + return REG_ESPACE; + result = tmp_node; + break; + } + + case PARSE_UNION: + switch (*ctx->re) + { + case CHAR_PIPE: + STACK_PUSHX(stack, int, PARSE_UNION); + STACK_PUSHX(stack, voidptr, result); + STACK_PUSHX(stack, int, PARSE_POST_UNION); + STACK_PUSHX(stack, int, PARSE_BRANCH); + ctx->re++; + break; + + case CHAR_RPAREN: + ctx->re++; + break; + + default: + break; + } + break; + + case PARSE_POST_UNION: + { + tre_ast_node_t *tmp_node; + tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); + tmp_node = tre_ast_new_union(ctx->mem, tree, result); + if (!tmp_node) + return REG_ESPACE; + result = tmp_node; + break; + } + + case PARSE_POSTFIX: + /* Parse postfix operators. */ + switch (*ctx->re) + { + case CHAR_PLUS: + case CHAR_QUESTIONMARK: + if (!(ctx->cflags & REG_EXTENDED)) + break; + /*FALLTHROUGH*/ + case CHAR_STAR: + { + tre_ast_node_t *tmp_node; + int minimal = 0; + int rep_min = 0; + int rep_max = -1; + + if (*ctx->re == CHAR_PLUS) + rep_min = 1; + if (*ctx->re == CHAR_QUESTIONMARK) + rep_max = 1; + + ctx->re++; + tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, + minimal); + if (tmp_node == NULL) + return REG_ESPACE; + result = tmp_node; + STACK_PUSHX(stack, int, PARSE_POSTFIX); + } + break; + + case CHAR_BACKSLASH: + /* "\{" is special without REG_EXTENDED */ + if (!(ctx->cflags & REG_EXTENDED) + && *(ctx->re + 1) == CHAR_LBRACE) + { + ctx->re++; + goto parse_brace; + } + else + break; + + case CHAR_LBRACE: + /* "{" is literal without REG_EXTENDED */ + if (!(ctx->cflags & REG_EXTENDED)) + break; + + parse_brace: + ctx->re++; + + status = tre_parse_bound(ctx, &result); + if (status != REG_OK) + return status; + STACK_PUSHX(stack, int, PARSE_POSTFIX); + break; + } + break; + + case PARSE_ATOM: + /* Parse an atom. An atom is a regular expression enclosed in `()', + an empty set of `()', a bracket expression, `.', `^', `$', + a `\' followed by a character, or a single character. */ + + switch (*ctx->re) + { + case CHAR_LPAREN: /* parenthesized subexpression */ + + if (ctx->cflags & REG_EXTENDED) + { + lparen: + depth++; + { + ctx->re++; + /* First parse a whole RE, then mark the resulting tree + for submatching. */ + STACK_PUSHX(stack, int, ctx->submatch_id); + STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); + STACK_PUSHX(stack, int, PARSE_RE); + ctx->submatch_id++; + } + } + else + goto parse_literal; + break; + + case CHAR_LBRACKET: /* bracket expression */ + ctx->re++; + status = tre_parse_bracket(ctx, &result); + if (status != REG_OK) + return status; + break; + + case CHAR_BACKSLASH: + /* If this is "\(" or "\)" chew off the backslash and + try again. */ + if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN) + { + ctx->re++; + goto lparen; + } + if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re |